mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
Merge pull request #1722 from asminkarki012/fix/csv-parser-include-headers
fix[csv_parser]:missing header
This commit is contained in:
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
|
|||||||
for more information.
|
for more information.
|
||||||
Set to empty dict by default, this means pandas will try to figure
|
Set to empty dict by default, this means pandas will try to figure
|
||||||
out the separators, table head, etc. on its own.
|
out the separators, table head, etc. on its own.
|
||||||
|
|
||||||
|
header_period (int): Controls how headers are included in output (default is 20):
|
||||||
|
- 0: Headers only at the beginning
|
||||||
|
- 1: Headers in every row
|
||||||
|
- N > 1: Headers every N rows
|
||||||
|
|
||||||
|
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
|
|||||||
col_joiner: str = ", ",
|
col_joiner: str = ", ",
|
||||||
row_joiner: str = "\n",
|
row_joiner: str = "\n",
|
||||||
pandas_config: dict = {},
|
pandas_config: dict = {},
|
||||||
|
header_period: int = 20,
|
||||||
|
header_prefix: str = "HEADERS: ",
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Init params."""
|
"""Init params."""
|
||||||
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
|
|||||||
self._col_joiner = col_joiner
|
self._col_joiner = col_joiner
|
||||||
self._row_joiner = row_joiner
|
self._row_joiner = row_joiner
|
||||||
self._pandas_config = pandas_config
|
self._pandas_config = pandas_config
|
||||||
|
self._header_period = header_period
|
||||||
|
self._header_prefix = header_prefix
|
||||||
|
|
||||||
def _init_parser(self) -> Dict:
|
def _init_parser(self) -> Dict:
|
||||||
"""Init parser."""
|
"""Init parser."""
|
||||||
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
|
|||||||
raise ValueError("pandas module is required to read CSV files.")
|
raise ValueError("pandas module is required to read CSV files.")
|
||||||
|
|
||||||
df = pd.read_csv(file, **self._pandas_config)
|
df = pd.read_csv(file, **self._pandas_config)
|
||||||
|
headers = df.columns.tolist()
|
||||||
|
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
|
||||||
|
|
||||||
text_list = df.apply(
|
if not self._concat_rows:
|
||||||
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
|
return df.apply(
|
||||||
).tolist()
|
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
|
||||||
|
).tolist()
|
||||||
|
|
||||||
|
text_list = []
|
||||||
|
if self._header_period != 1:
|
||||||
|
text_list.append(header_row)
|
||||||
|
|
||||||
|
for i, row in df.iterrows():
|
||||||
|
if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
|
||||||
|
text_list.append(header_row)
|
||||||
|
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
|
||||||
|
if self._header_period == 1 and i < len(df) - 1:
|
||||||
|
text_list.append(header_row)
|
||||||
|
|
||||||
if self._concat_rows:
|
return self._row_joiner.join(text_list)
|
||||||
return (self._row_joiner).join(text_list)
|
|
||||||
else:
|
|
||||||
return text_list
|
|
||||||
|
|
||||||
|
|
||||||
class ExcelParser(BaseParser):
|
class ExcelParser(BaseParser):
|
||||||
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
|
|||||||
for more information.
|
for more information.
|
||||||
Set to empty dict by default, this means pandas will try to figure
|
Set to empty dict by default, this means pandas will try to figure
|
||||||
out the table structure on its own.
|
out the table structure on its own.
|
||||||
|
|
||||||
|
header_period (int): Controls how headers are included in output (default is 20):
|
||||||
|
- 0: Headers only at the beginning
|
||||||
|
- 1: Headers in every row
|
||||||
|
- N > 1: Headers every N rows
|
||||||
|
|
||||||
|
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
|
|||||||
col_joiner: str = ", ",
|
col_joiner: str = ", ",
|
||||||
row_joiner: str = "\n",
|
row_joiner: str = "\n",
|
||||||
pandas_config: dict = {},
|
pandas_config: dict = {},
|
||||||
|
header_period: int = 20,
|
||||||
|
header_prefix: str = "HEADERS: ",
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Init params."""
|
"""Init params."""
|
||||||
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
|
|||||||
self._col_joiner = col_joiner
|
self._col_joiner = col_joiner
|
||||||
self._row_joiner = row_joiner
|
self._row_joiner = row_joiner
|
||||||
self._pandas_config = pandas_config
|
self._pandas_config = pandas_config
|
||||||
|
self._header_period = header_period
|
||||||
|
self._header_prefix = header_prefix
|
||||||
|
|
||||||
def _init_parser(self) -> Dict:
|
def _init_parser(self) -> Dict:
|
||||||
"""Init parser."""
|
"""Init parser."""
|
||||||
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
|
|||||||
raise ValueError("pandas module is required to read Excel files.")
|
raise ValueError("pandas module is required to read Excel files.")
|
||||||
|
|
||||||
df = pd.read_excel(file, **self._pandas_config)
|
df = pd.read_excel(file, **self._pandas_config)
|
||||||
|
headers = df.columns.tolist()
|
||||||
|
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
|
||||||
|
|
||||||
|
if not self._concat_rows:
|
||||||
|
return df.apply(
|
||||||
|
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
|
||||||
|
).tolist()
|
||||||
|
|
||||||
|
text_list = []
|
||||||
|
if self._header_period != 1:
|
||||||
|
text_list.append(header_row)
|
||||||
|
|
||||||
text_list = df.apply(
|
for i, row in df.iterrows():
|
||||||
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
|
if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
|
||||||
).tolist()
|
text_list.append(header_row)
|
||||||
|
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
|
||||||
if self._concat_rows:
|
if self._header_period == 1 and i < len(df) - 1:
|
||||||
return (self._row_joiner).join(text_list)
|
text_list.append(header_row)
|
||||||
else:
|
return self._row_joiner.join(text_list)
|
||||||
return text_list
|
|
||||||
Reference in New Issue
Block a user