new: add ExcelParser (tested) to read .xlsx files

2026-02-11 00:31:02 +00:00 · 2024-10-01 22:03:10 +05:30
parent af1b81097f
commit 7794129929
8 changed files with 116 additions and 27 deletions
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
 from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
-from application.parser.file.tabular_parser import PandasCSVParser
+from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.schema.base import Document

 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".pdf": PDFParser(),
    ".docx": DocxParser(),
    ".csv": PandasCSVParser(),
+    ".xlsx":ExcelParser(),
    ".epub": EpubParser(),
    ".md": MarkdownParser(),
    ".rst": RstParser(),
--- a/application/parser/file/tabular_parser.py
+++ b/application/parser/file/tabular_parser.py
@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
            return (self._row_joiner).join(text_list)
        else:
            return text_list
+
+
+class ExcelParser(BaseParser):
+    r"""Excel (.xlsx) parser.
+
+    Parses Excel files using Pandas `read_excel` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): whether to concatenate all rows into one document.
+            If set to False, a list with one string per row is returned.
+            True by default.
+
+        col_joiner (str): Separator to use for joining cols per row.
+            Set to ", " by default.
+
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information.
+            Defaults to None, which is treated as an empty dict so pandas
+            figures out the table structure on its own.
+
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            col_joiner: str = ", ",
+            row_joiner: str = "\n",
+            pandas_config: Union[dict, None] = None,
+            **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._col_joiner = col_joiner
+        self._row_joiner = row_joiner
+        # A None sentinel avoids sharing one mutable default dict across instances.
+        self._pandas_config = {} if pandas_config is None else pandas_config
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse file; `errors` is accepted but not used by this parser."""
+        try:
+            import pandas as pd
+        except ImportError as err:
+            raise ValueError("pandas module is required to read Excel files.") from err
+
+        df = pd.read_excel(file, **self._pandas_config)
+        # Stringify each row's cells and join them with the column separator.
+        text_list = df.apply(
+            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+        ).tolist()
+        if self._concat_rows:
+            return (self._row_joiner).join(text_list)
+        else:
+            return text_list
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
 orjson==3.10.7
 packaging==24.1
 pandas==2.2.3
+openpyxl==3.1.5
 pathable==0.4.3
 pillow==10.4.0
 portalocker==2.10.1
--- a/application/vectorstore/faiss.py
+++ b/application/vectorstore/faiss.py
@@ -3,30 +3,27 @@ from application.vectorstore.base import BaseVectorStore
 from application.core.settings import settings
 import os

-def get_vectorstore(path):
+def get_vectorstore(path: str) -> str:
    if path:
-        vectorstore = "indexes/"+path
-        vectorstore = os.path.join("application", vectorstore)
+        vectorstore = os.path.join("application", "indexes", path)
    else:
        vectorstore = os.path.join("application")
-
    return vectorstore

 class FaissStore(BaseVectorStore):
-
-    def __init__(self, source_id, embeddings_key, docs_init=None):
+    def __init__(self, source_id: str, embeddings_key: str, docs_init=None):
        super().__init__()
        self.path = get_vectorstore(source_id)
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
-        if docs_init:
-            self.docsearch = FAISS.from_documents(
-                docs_init, embeddings
-            )
-        else:
-            self.docsearch = FAISS.load_local(
-                self.path, embeddings, 
-                allow_dangerous_deserialization=True
-            )
+
+        try:
+            if docs_init:
+                self.docsearch = FAISS.from_documents(docs_init, embeddings)
+            else:
+                self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
+        except Exception:
+            raise
+
        self.assert_embedding_dimensions(embeddings)

    def search(self, *args, **kwargs):
@@ -42,16 +39,12 @@ class FaissStore(BaseVectorStore):
        return self.docsearch.delete(*args, **kwargs)

    def assert_embedding_dimensions(self, embeddings):
-        """
-        Check that the word embedding dimension of the docsearch index matches
-        the dimension of the word embeddings used 
-        """
+        """Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used."""
        if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
-            try:
-                word_embedding_dimension = embeddings.dimension
-            except AttributeError as e:
-                raise AttributeError("'dimension' attribute not found in embeddings instance. Make sure the embeddings object is properly initialized.") from e
+            word_embedding_dimension = getattr(embeddings, 'dimension', None)
+            if word_embedding_dimension is None:
+                raise AttributeError("'dimension' attribute not found in embeddings instance.")
+            
            docsearch_index_dimension = self.docsearch.index.d
            if word_embedding_dimension != docsearch_index_dimension:
-                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " +
-                                 f"!= docsearch index dimension ({docsearch_index_dimension})")
+                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})")
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -77,7 +77,7 @@
      "remote": "Remote",
      "name": "Name",
      "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
      "uploadedFiles": "Uploaded Files",
      "cancel": "Cancel",
      "train": "Train",
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -270,6 +270,7 @@ function Upload({
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        ['.docx'],
      'text/csv': ['.csv'],
+      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
    },
  });

--- a/mlb_teams_2012.csv:Zone.Identifier
+++ b/mlb_teams_2012.csv:Zone.Identifier
@@ -0,0 +1,3 @@
+[ZoneTransfer]
+ZoneId=3
+HostUrl=https://github.com/
--- a/test.py
+++ b/test.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser
+
+# Define the path to the .xlsx file
+file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx")
+parser = ExcelParser(concat_rows=True, pandas_config={})
+
+# Alternative input: uncomment the two lines below to parse a CSV file with PandasCSVParser instead
+# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv")
+# parser = PandasCSVParser(concat_rows=True, pandas_config={})
+
+
+
+# Initialize the parser configuration (this can be customized if needed)
+parser.init_parser()
+
+# Check if the parser config is set (this is optional)
+if parser.parser_config_set:
+    print("Parser config has been set.")
+
+# Parse the Excel file
+parsed_data = parser.parse_file(file_path)
+print(parsed_data)
+
+