Merge pull request #1722 from asminkarki012/fix/csv-parser-include-headers

fix[csv_parser]:missing header
2025-11-29 08:33:20 +00:00 · 2025-04-04 14:53:45 +03:00
parent 94c7bba168 57a6fb31b2
commit 3227b0e69c
1 changed files with 58 additions and 17 deletions
--- a/application/parser/file/tabular_parser.py
+++ b/application/parser/file/tabular_parser.py
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
            for more information.
            Set to empty dict by default, this means pandas will try to figure
            out the separators, table head, etc. on its own.
-
+            
        header_period (int): Controls how headers are included in output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows (default: N = 20)
        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """
    def __init__(
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
            col_joiner: str = ", ",
            row_joiner: str = "\n",
            pandas_config: dict = {},
            header_period: int = 20,
            header_prefix: str = "HEADERS: ",
            **kwargs: Any
    ) -> None:
        """Init params."""
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix
    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
            raise ValueError("pandas module is required to read CSV files.")
        df = pd.read_csv(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
-        text_list = df.apply(
+        if not self._concat_rows:
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            return df.apply(
-        ).tolist()
+                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()
        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)
        for i, row in df.iterrows():
            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
                text_list.append(header_row)
            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
            if self._header_period == 1 and i < len(df) - 1:
                text_list.append(header_row)
-        if self._concat_rows:
+        return self._row_joiner.join(text_list)
            return (self._row_joiner).join(text_list)
        else:
            return text_list
 class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
            for more information.
            Set to empty dict by default, this means pandas will try to figure
            out the table structure on its own.
-
+            
        header_period (int): Controls how headers are included in output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows (default: N = 20)
        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """
    def __init__(
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
            col_joiner: str = ", ",
            row_joiner: str = "\n",
            pandas_config: dict = {},
            header_period: int = 20,
            header_prefix: str = "HEADERS: ",
            **kwargs: Any
    ) -> None:
        """Init params."""
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix
    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
            raise ValueError("pandas module is required to read Excel files.")
        df = pd.read_excel(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
        if not self._concat_rows:
            return df.apply(
                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()
        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)
-        text_list = df.apply(
+        for i, row in df.iterrows():
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
-        ).tolist()
+                text_list.append(header_row)
-
+            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
-        if self._concat_rows:
+            if self._header_period == 1 and i < len(df) - 1:
-            return (self._row_joiner).join(text_list)
+                text_list.append(header_row)
-        else:
+        return self._row_joiner.join(text_list)
            return text_list