Merge branch 'main' of https://github.com/arc53/DocsGPT

2026-01-26 00:40:32 +00:00 · 2025-04-05 01:25:36 +00:00
parent 29c899627e 3227b0e69c
commit bdac2ffa27
1 changed files with 58 additions and 17 deletions
--- a/application/parser/file/tabular_parser.py
+++ b/application/parser/file/tabular_parser.py
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
            for more information.
            Set to empty dict by default, this means pandas will try to figure
            out the separators, table head, etc. on its own.
-
+            
+        header_period (int): Controls how headers are included in output:
+            - 0: Headers only at the beginning
+            - 1: Headers in every row
+            - N > 1: Headers every N rows
+            
+        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
            col_joiner: str = ", ",
            row_joiner: str = "\n",
            pandas_config: dict = {},
+            header_period: int = 20,
+            header_prefix: str = "HEADERS: ",
            **kwargs: Any
    ) -> None:
        """Init params."""
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
+        self._header_period = header_period
+        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
            raise ValueError("pandas module is required to read CSV files.")

        df = pd.read_csv(file, **self._pandas_config)
+        headers = df.columns.tolist()
+        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
+        if not self._concat_rows:
+            return df.apply(
+                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            ).tolist()
+        
+        text_list = []
+        if self._header_period != 1:
+            text_list.append(header_row)
+        
+        for i, row in df.iterrows():
+            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
+                text_list.append(header_row)
+            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
+            if self._header_period == 1 and i < len(df) - 1:
+                text_list.append(header_row)

-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
+        return self._row_joiner.join(text_list)


 class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
            for more information.
            Set to empty dict by default, this means pandas will try to figure
            out the table structure on its own.
-
+            
+        header_period (int): Controls how headers are included in output:
+            - 0: Headers only at the beginning (default)
+            - 1: Headers in every row
+            - N > 1: Headers every N rows
+            
+        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
            col_joiner: str = ", ",
            row_joiner: str = "\n",
            pandas_config: dict = {},
+            header_period: int = 20,
+            header_prefix: str = "HEADERS: ",
            **kwargs: Any
    ) -> None:
        """Init params."""
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
+        self._header_period = header_period
+        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
            raise ValueError("pandas module is required to read Excel files.")

        df = pd.read_excel(file, **self._pandas_config)
+        headers = df.columns.tolist()
+        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
+        
+        if not self._concat_rows:
+            return df.apply(
+                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            ).tolist()
+        
+        text_list = []
+        if self._header_period != 1:
+            text_list.append(header_row)

-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
-
-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
+        for i, row in df.iterrows():
+            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
+                text_list.append(header_row)
+            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
+            if self._header_period == 1 and i < len(df) - 1:
+                text_list.append(header_row)
+        return self._row_joiner.join(text_list)