From c70be12bfdf4f9c86608fb44ed3a028a15f5c61b Mon Sep 17 00:00:00 2001 From: asminkarki012 Date: Fri, 28 Mar 2025 22:46:11 +0545 Subject: [PATCH 1/2] fix[csv_parser]:missing header --- application/parser/file/tabular_parser.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index b2dbd193..079fb475 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -104,9 +104,13 @@ class PandasCSVParser(BaseParser): raise ValueError("pandas module is required to read CSV files.") df = pd.read_csv(file, **self._pandas_config) + headers = df.columns.tolist() text_list = df.apply( - lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + lambda row: self._col_joiner.join( + [f"{headers[i]}: {str(val)}" for i, val in enumerate(row)] + ), + axis=1, ).tolist() if self._concat_rows: @@ -169,12 +173,16 @@ class ExcelParser(BaseParser): raise ValueError("pandas module is required to read Excel files.") df = pd.read_excel(file, **self._pandas_config) + headers = df.columns.tolist() text_list = df.apply( - lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + lambda row: self._col_joiner.join( + [f"{headers[i]}: {str(val)}" for i, val in enumerate(row)] + ), + axis=1, ).tolist() if self._concat_rows: return (self._row_joiner).join(text_list) else: - return text_list \ No newline at end of file + return text_list From 57a6fb31b2f252e0414defe464f0ccc5f8ea4864 Mon Sep 17 00:00:00 2001 From: Pavel Date: Mon, 31 Mar 2025 22:28:04 +0400 Subject: [PATCH 2/2] periodic header injection --- application/parser/file/tabular_parser.py | 79 ++++++++++++++++------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index 079fb475..40971b3c 100644 --- a/application/parser/file/tabular_parser.py +++ 
b/application/parser/file/tabular_parser.py @@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser): for more information. Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own. - + + header_period (int): Controls how headers are included in output: + - 0: Headers only at the beginning + - 1: Headers in every row + - N > 1: Headers every N rows + + header_prefix (str): Prefix for header rows. Default is "HEADERS: ". """ def __init__( @@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser): col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, + header_period: int = 20, + header_prefix: str = "HEADERS: ", **kwargs: Any ) -> None: """Init params.""" @@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser): self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config + self._header_period = header_period + self._header_prefix = header_prefix def _init_parser(self) -> Dict: """Init parser.""" @@ -105,18 +115,25 @@ class PandasCSVParser(BaseParser): df = pd.read_csv(file, **self._pandas_config) headers = df.columns.tolist() + header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}" - text_list = df.apply( - lambda row: self._col_joiner.join( - [f"{headers[i]}: {str(val)}" for i, val in enumerate(row)] - ), - axis=1, - ).tolist() + if not self._concat_rows: + return df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + text_list = [] + if self._header_period != 1: + text_list.append(header_row) + + for i, row in df.iterrows(): + if (self._header_period > 1 and i > 0 and i % self._header_period == 0): + text_list.append(header_row) + text_list.append(self._col_joiner.join(row.astype(str).tolist())) + if self._header_period == 1 and i < len(df) - 1: + text_list.append(header_row) - if self._concat_rows: - return (self._row_joiner).join(text_list) - else: - return text_list + return self._row_joiner.join(text_list) 
class ExcelParser(BaseParser): @@ -142,7 +159,13 @@ class ExcelParser(BaseParser): for more information. Set to empty dict by default, this means pandas will try to figure out the table structure on its own. - + + header_period (int): Controls how headers are included in output: + - 0: Headers only at the beginning + - 1: Headers in every row + - N > 1: Headers every N rows (default: 20) + + header_prefix (str): Prefix for header rows. Default is "HEADERS: ". """ def __init__( @@ -152,6 +175,8 @@ class ExcelParser(BaseParser): col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, + header_period: int = 20, + header_prefix: str = "HEADERS: ", **kwargs: Any ) -> None: """Init params.""" @@ -160,6 +185,8 @@ class ExcelParser(BaseParser): self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config + self._header_period = header_period + self._header_prefix = header_prefix def _init_parser(self) -> Dict: """Init parser.""" @@ -174,15 +201,21 @@ class ExcelParser(BaseParser): df = pd.read_excel(file, **self._pandas_config) headers = df.columns.tolist() + header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}" + + if not self._concat_rows: + return df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + text_list = [] + if self._header_period != 1: + text_list.append(header_row) - text_list = df.apply( - lambda row: self._col_joiner.join( - [f"{headers[i]}: {str(val)}" for i, val in enumerate(row)] - ), - axis=1, - ).tolist() - - if self._concat_rows: - return (self._row_joiner).join(text_list) - else: - return text_list + for i, row in df.iterrows(): + if (self._header_period > 1 and i > 0 and i % self._header_period == 0): + text_list.append(header_row) + text_list.append(self._col_joiner.join(row.astype(str).tolist())) + if self._header_period == 1 and i < len(df) - 1: + text_list.append(header_row) + return self._row_joiner.join(text_list) \ No newline at end 
of file