From 7794129929f2e8d9547702330dbd2638fc6ccf97 Mon Sep 17 00:00:00 2001
From: "devendra.parihar" <devendra.parihar@heliossolutions.co>
Date: Tue, 1 Oct 2024 22:03:10 +0530
Subject: [PATCH 1/3] new: add ExcelParser (tested) to read .xlsx files

---
 application/parser/file/bulk.py           |  3 +-
 application/parser/file/tabular_parser.py | 65 +++++++++++++++++++++++
 application/requirements.txt              |  1 +
 application/vectorstore/faiss.py          | 43 +++++++--------
 frontend/src/locale/en.json               |  2 +-
 frontend/src/upload/Upload.tsx            |  1 +
 mlb_teams_2012.csv:Zone.Identifier        |  3 ++
 test.py                                   | 25 +++++++++
 8 files changed, 116 insertions(+), 27 deletions(-)
 create mode 100644 mlb_teams_2012.csv:Zone.Identifier
 create mode 100644 test.py

diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index aec6c8c1..79fc2c45 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
 from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
-from application.parser.file.tabular_parser import PandasCSVParser
+from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.schema.base import Document
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".pdf": PDFParser(),
     ".docx": DocxParser(),
     ".csv": PandasCSVParser(),
+    ".xlsx":ExcelParser(),
     ".epub": EpubParser(),
     ".md": MarkdownParser(),
     ".rst": RstParser(),
diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py
index 81355ae0..b2dbd193 100644
--- a/application/parser/file/tabular_parser.py
+++ b/application/parser/file/tabular_parser.py
@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
             return (self._row_joiner).join(text_list)
         else:
             return text_list
+
+
+class ExcelParser(BaseParser):
+    r"""Excel (.xlsx) parser.
+
+    Parses Excel files using Pandas `read_excel` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): whether to concatenate all rows into one document.
+            If set to False, a Document will be created for each row.
+            True by default.
+
+        col_joiner (str): Separator to use for joining cols per row.
+            Set to ", " by default.
+
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information.
+            Set to empty dict by default, this means pandas will try to figure
+            out the table structure on its own.
+
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            col_joiner: str = ", ",
+            row_joiner: str = "\n",
+            pandas_config: dict = {},
+            **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._col_joiner = col_joiner
+        self._row_joiner = row_joiner
+        self._pandas_config = pandas_config
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse file."""
+        try:
+            import pandas as pd
+        except ImportError:
+            raise ValueError("pandas module is required to read Excel files.")
+
+        df = pd.read_excel(file, **self._pandas_config)
+
+        text_list = df.apply(
+            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+        ).tolist()
+
+        if self._concat_rows:
+            return (self._row_joiner).join(text_list)
+        else:
+            return text_list
\ No newline at end of file
diff --git a/application/requirements.txt b/application/requirements.txt
index d7621cfd..6a57dd12 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
 orjson==3.10.7
 packaging==24.1
 pandas==2.2.3
+openpyxl==3.1.5
 pathable==0.4.3
 pillow==10.4.0
 portalocker==2.10.1
diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py
index a8839cd2..afa55db9 100644
--- a/application/vectorstore/faiss.py
+++ b/application/vectorstore/faiss.py
@@ -3,30 +3,27 @@ from application.vectorstore.base import BaseVectorStore
 from application.core.settings import settings
 import os
 
-def get_vectorstore(path):
+def get_vectorstore(path: str) -> str:
     if path:
-        vectorstore = "indexes/"+path
-        vectorstore = os.path.join("application", vectorstore)
+        vectorstore = os.path.join("application", "indexes", path)
     else:
         vectorstore = os.path.join("application")
-
     return vectorstore
 
 class FaissStore(BaseVectorStore):
-
-    def __init__(self, source_id, embeddings_key, docs_init=None):
+    def __init__(self, source_id: str, embeddings_key: str, docs_init=None):
         super().__init__()
         self.path = get_vectorstore(source_id)
         embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
-        if docs_init:
-            self.docsearch = FAISS.from_documents(
-                docs_init, embeddings
-            )
-        else:
-            self.docsearch = FAISS.load_local(
-                self.path, embeddings, 
-                allow_dangerous_deserialization=True
-            )
+
+        try:
+            if docs_init:
+                self.docsearch = FAISS.from_documents(docs_init, embeddings)
+            else:
+                self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
+        except Exception:
+            raise
+
         self.assert_embedding_dimensions(embeddings)
 
     def search(self, *args, **kwargs):
@@ -42,16 +39,12 @@ class FaissStore(BaseVectorStore):
         return self.docsearch.delete(*args, **kwargs)
 
     def assert_embedding_dimensions(self, embeddings):
-        """
-        Check that the word embedding dimension of the docsearch index matches
-        the dimension of the word embeddings used 
-        """
+        """Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used."""
         if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
-            try:
-                word_embedding_dimension = embeddings.dimension
-            except AttributeError as e:
-                raise AttributeError("'dimension' attribute not found in embeddings instance. Make sure the embeddings object is properly initialized.") from e
+            word_embedding_dimension = getattr(embeddings, 'dimension', None)
+            if word_embedding_dimension is None:
+                raise AttributeError("'dimension' attribute not found in embeddings instance.")
+            
             docsearch_index_dimension = self.docsearch.index.d
             if word_embedding_dimension != docsearch_index_dimension:
-                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " +
-                                 f"!= docsearch index dimension ({docsearch_index_dimension})")
\ No newline at end of file
+                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})")
diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json
index 645703a2..7ba8add1 100644
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -77,7 +77,7 @@
       "remote": "Remote",
       "name": "Name",
       "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
       "uploadedFiles": "Uploaded Files",
       "cancel": "Cancel",
       "train": "Train",
diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index b898e4b6..cf56a21b 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -270,6 +270,7 @@ function Upload({
       'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
         ['.docx'],
       'text/csv': ['.csv'],
+      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
     },
   });
 
diff --git a/mlb_teams_2012.csv:Zone.Identifier b/mlb_teams_2012.csv:Zone.Identifier
new file mode 100644
index 00000000..1bf0b28e
--- /dev/null
+++ b/mlb_teams_2012.csv:Zone.Identifier
@@ -0,0 +1,3 @@
+[ZoneTransfer]
+ZoneId=3
+HostUrl=https://github.com/
diff --git a/test.py b/test.py
new file mode 100644
index 00000000..59d75188
--- /dev/null
+++ b/test.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser
+
+# Define the path to the .xlsx file
+file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx")
+parser = ExcelParser(concat_rows=True, pandas_config={})
+
+# Initialize the ExcelParser
+# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv")
+# parser = PandasCSVParser(concat_rows=True, pandas_config={})
+
+
+
+# Initialize the parser configuration (this can be customized if needed)
+parser.init_parser()
+
+# Check if the parser config is set (this is optional)
+if parser.parser_config_set:
+    print("Parser config has been set.")
+
+# Parse the Excel file
+parsed_data = parser.parse_file(file_path)
+print(parsed_data)
+
+

From e9a7722915aefafd58d9077a8b6536c6150ce495 Mon Sep 17 00:00:00 2001
From: "devendra.parihar" <devendra.parihar@heliossolutions.co>
Date: Tue, 1 Oct 2024 22:11:17 +0530
Subject: [PATCH 2/3] chore: remove ad-hoc test.py script used to try out ExcelParser

---
 test.py | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index 59d75188..00000000
--- a/test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from pathlib import Path
-from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser
-
-# Define the path to the .xlsx file
-file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx")
-parser = ExcelParser(concat_rows=True, pandas_config={})
-
-# Initialize the ExcelParser
-# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv")
-# parser = PandasCSVParser(concat_rows=True, pandas_config={})
-
-
-
-# Initialize the parser configuration (this can be customized if needed)
-parser.init_parser()
-
-# Check if the parser config is set (this is optional)
-if parser.parser_config_set:
-    print("Parser config has been set.")
-
-# Parse the Excel file
-parsed_data = parser.parse_file(file_path)
-print(parsed_data)
-
-

From 09a15e2e59b119055ca31883b9da9b402c333c25 Mon Sep 17 00:00:00 2001
From: Devendra Parihar <54232149+Devparihar5@users.noreply.github.com>
Date: Fri, 4 Oct 2024 22:59:40 +0530
Subject: [PATCH 3/3] Delete mlb_teams_2012.csv:Zone.Identifier

---
 mlb_teams_2012.csv:Zone.Identifier | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 mlb_teams_2012.csv:Zone.Identifier

diff --git a/mlb_teams_2012.csv:Zone.Identifier b/mlb_teams_2012.csv:Zone.Identifier
deleted file mode 100644
index 1bf0b28e..00000000
--- a/mlb_teams_2012.csv:Zone.Identifier
+++ /dev/null
@@ -1,3 +0,0 @@
-[ZoneTransfer]
-ZoneId=3
-HostUrl=https://github.com/