Merge pull request #1722 from asminkarki012/fix/csv-parser-include-headers

fix[csv_parser]:missing header
This commit is contained in:
Alex
2025-04-04 14:53:45 +03:00
committed by GitHub

View File

@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
for more information. for more information.
Set to empty dict by default, this means pandas will try to figure Set to empty dict by default, this means pandas will try to figure
out the separators, table head, etc. on its own. out the separators, table head, etc. on its own.
header_period (int): Controls how headers are included in output (default: 20):
- 0: Headers only at the beginning
- 1: Headers in every row
- N > 1: Headers every N rows
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
""" """
def __init__( def __init__(
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
col_joiner: str = ", ", col_joiner: str = ", ",
row_joiner: str = "\n", row_joiner: str = "\n",
pandas_config: dict = {}, pandas_config: dict = {},
header_period: int = 20,
header_prefix: str = "HEADERS: ",
**kwargs: Any **kwargs: Any
) -> None: ) -> None:
"""Init params.""" """Init params."""
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
self._col_joiner = col_joiner self._col_joiner = col_joiner
self._row_joiner = row_joiner self._row_joiner = row_joiner
self._pandas_config = pandas_config self._pandas_config = pandas_config
self._header_period = header_period
self._header_prefix = header_prefix
def _init_parser(self) -> Dict: def _init_parser(self) -> Dict:
"""Init parser.""" """Init parser."""
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
raise ValueError("pandas module is required to read CSV files.") raise ValueError("pandas module is required to read CSV files.")
df = pd.read_csv(file, **self._pandas_config) df = pd.read_csv(file, **self._pandas_config)
headers = df.columns.tolist()
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
text_list = df.apply( if not self._concat_rows:
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 return df.apply(
).tolist() lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
text_list = []
if self._header_period != 1:
text_list.append(header_row)
for i, row in df.iterrows():
if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
text_list.append(header_row)
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
if self._header_period == 1 and i < len(df) - 1:
text_list.append(header_row)
if self._concat_rows: return self._row_joiner.join(text_list)
return (self._row_joiner).join(text_list)
else:
return text_list
class ExcelParser(BaseParser): class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
for more information. for more information.
Set to empty dict by default, this means pandas will try to figure Set to empty dict by default, this means pandas will try to figure
out the table structure on its own. out the table structure on its own.
header_period (int): Controls how headers are included in output (default: 20):
- 0: Headers only at the beginning
- 1: Headers in every row
- N > 1: Headers every N rows
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
""" """
def __init__( def __init__(
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
col_joiner: str = ", ", col_joiner: str = ", ",
row_joiner: str = "\n", row_joiner: str = "\n",
pandas_config: dict = {}, pandas_config: dict = {},
header_period: int = 20,
header_prefix: str = "HEADERS: ",
**kwargs: Any **kwargs: Any
) -> None: ) -> None:
"""Init params.""" """Init params."""
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
self._col_joiner = col_joiner self._col_joiner = col_joiner
self._row_joiner = row_joiner self._row_joiner = row_joiner
self._pandas_config = pandas_config self._pandas_config = pandas_config
self._header_period = header_period
self._header_prefix = header_prefix
def _init_parser(self) -> Dict: def _init_parser(self) -> Dict:
"""Init parser.""" """Init parser."""
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
raise ValueError("pandas module is required to read Excel files.") raise ValueError("pandas module is required to read Excel files.")
df = pd.read_excel(file, **self._pandas_config) df = pd.read_excel(file, **self._pandas_config)
headers = df.columns.tolist()
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
if not self._concat_rows:
return df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
text_list = []
if self._header_period != 1:
text_list.append(header_row)
text_list = df.apply( for i, row in df.iterrows():
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
).tolist() text_list.append(header_row)
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
if self._concat_rows: if self._header_period == 1 and i < len(df) - 1:
return (self._row_joiner).join(text_list) text_list.append(header_row)
else: return self._row_joiner.join(text_list)
return text_list