Merge pull request #1184 from Devparihar5/ExcelParser

new: added ExcelParser(tested) to read .xlsx files
2025-11-29 08:33:20 +00:00 · 2024-10-06 23:19:37 +01:00
parent 4895d389e4 09a15e2e59
commit c9e95a9146
6 changed files with 71 additions and 3 deletions
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
 from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
-from application.parser.file.tabular_parser import PandasCSVParser
+from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.schema.base import Document

 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".pdf": PDFParser(),
    ".docx": DocxParser(),
    ".csv": PandasCSVParser(),
+    ".xlsx":ExcelParser(),
    ".epub": EpubParser(),
    ".md": MarkdownParser(),
    ".rst": RstParser(),
--- a/application/parser/file/tabular_parser.py
+++ b/application/parser/file/tabular_parser.py
@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
            return (self._row_joiner).join(text_list)
        else:
            return text_list
+
+
+class ExcelParser(BaseParser):
+    r"""Excel (.xlsx) parser.
+
+    Parses Excel files using Pandas `read_excel` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): whether to concatenate all rows into one string.
+            If set to False, a list with one string per row is returned.
+            True by default.
+        col_joiner (str): Separator to use for joining cols per row.
+            Set to ", " by default.
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information.
+            None by default, which is treated as an empty dict: no extra
+            options are passed and pandas uses its own defaults.
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            col_joiner: str = ", ",
+            row_joiner: str = "\n",
+            pandas_config: Union[dict, None] = None,
+            **kwargs: Any
+    ) -> None:
+        """Store the joining options and a private copy of `pandas_config`."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._col_joiner = col_joiner
+        self._row_joiner = row_joiner
+        self._pandas_config = dict(pandas_config or {})  # copy: never shared
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Read `file` with pandas and return its rows rendered as text.
+
+        `errors` is accepted but not used by this parser.
+        Returns one joined string when `concat_rows` is set, otherwise a list
+        with one string per row. Raises ValueError if pandas is not installed.
+        """
+        try:
+            import pandas as pd
+        except ImportError as err:
+            raise ValueError("pandas module is required to read Excel files.") from err
+
+        df = pd.read_excel(file, **self._pandas_config)
+
+        text_list = df.apply(
+            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+        ).tolist()
+
+        if self._concat_rows:
+            return (self._row_joiner).join(text_list)
+        return text_list
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
 orjson==3.10.7
 packaging==24.1
 pandas==2.2.3
+openpyxl==3.1.5
 pathable==0.4.3
 pillow==10.4.0
 portalocker==2.10.1
--- a/application/vectorstore/faiss.py
+++ b/application/vectorstore/faiss.py
@@ -22,7 +22,7 @@ class FaissStore(BaseVectorStore):
            else:
                self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
        except Exception:
-            raise  # Just re-raise the exception without assigning to e
+            raise

        self.assert_embedding_dimensions(embeddings)

--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -79,7 +79,7 @@
      "remote": "Remote",
      "name": "Name",
      "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
      "uploadedFiles": "Uploaded Files",
      "cancel": "Cancel",
      "train": "Train",
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -275,6 +275,7 @@ function Upload({
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        ['.docx'],
      'text/csv': ['.csv'],
+      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
    },
  });