From 7794129929f2e8d9547702330dbd2638fc6ccf97 Mon Sep 17 00:00:00 2001 From: "devendra.parihar" Date: Tue, 1 Oct 2024 22:03:10 +0530 Subject: [PATCH 1/3] new: added ExcelParser(tested) to read .xlsx files --- application/parser/file/bulk.py | 3 +- application/parser/file/tabular_parser.py | 65 +++++++++++++++++++++++ application/requirements.txt | 1 + application/vectorstore/faiss.py | 43 +++++++-------- frontend/src/locale/en.json | 2 +- frontend/src/upload/Upload.tsx | 1 + mlb_teams_2012.csv:Zone.Identifier | 3 ++ test.py | 25 +++++++++ 8 files changed, 116 insertions(+), 27 deletions(-) create mode 100644 mlb_teams_2012.csv:Zone.Identifier create mode 100644 test.py diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index aec6c8c1..79fc2c45 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser from application.parser.file.html_parser import HTMLParser from application.parser.file.markdown_parser import MarkdownParser from application.parser.file.rst_parser import RstParser -from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser from application.parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), ".docx": DocxParser(), ".csv": PandasCSVParser(), + ".xlsx":ExcelParser(), ".epub": EpubParser(), ".md": MarkdownParser(), ".rst": RstParser(), diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index 81355ae0..b2dbd193 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser): return (self._row_joiner).join(text_list) else: return text_list + + +class ExcelParser(BaseParser): + r"""Excel (.xlsx) parser. + + Parses Excel files using Pandas `read_excel` function. + If special parameters are required, use the `pandas_config` dict. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + col_joiner (str): Separator to use for joining cols per row. + Set to ", " by default. + + row_joiner (str): Separator to use for joining each row. + Only used when `concat_rows=True`. + Set to "\n" by default. + + pandas_config (dict): Options for the `pandas.read_excel` function call. + Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html + for more information. + Set to empty dict by default, this means pandas will try to figure + out the table structure on its own. + + """ + + def __init__( + self, + *args: Any, + concat_rows: bool = True, + col_joiner: str = ", ", + row_joiner: str = "\n", + pandas_config: dict = {}, + **kwargs: Any + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._col_joiner = col_joiner + self._row_joiner = row_joiner + self._pandas_config = pandas_config + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" + try: + import pandas as pd + except ImportError: + raise ValueError("pandas module is required to read Excel files.") + + df = pd.read_excel(file, **self._pandas_config) + + text_list = df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + if self._concat_rows: + return (self._row_joiner).join(text_list) + else: + return text_list \ No newline at end of file diff --git a/application/requirements.txt b/application/requirements.txt index d7621cfd..6a57dd12 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -49,6 +49,7 @@ openapi3-parser==1.1.18 orjson==3.10.7 packaging==24.1 pandas==2.2.3 +openpyxl==3.1.5 pathable==0.4.3 pillow==10.4.0 portalocker==2.10.1 diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index a8839cd2..afa55db9 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -3,30 +3,27 @@ from application.vectorstore.base import BaseVectorStore from application.core.settings import settings import os -def get_vectorstore(path): +def get_vectorstore(path: str) -> str: if path: - vectorstore = "indexes/"+path - vectorstore = os.path.join("application", vectorstore) + vectorstore = os.path.join("application", "indexes", path) else: vectorstore = os.path.join("application") - return vectorstore class FaissStore(BaseVectorStore): - - def __init__(self, source_id, embeddings_key, docs_init=None): + def __init__(self, source_id: str, embeddings_key: str, docs_init=None): super().__init__() self.path = get_vectorstore(source_id) embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) - if docs_init: - self.docsearch = FAISS.from_documents( - docs_init, embeddings - ) - else: - self.docsearch = FAISS.load_local( - self.path, embeddings, - allow_dangerous_deserialization=True - ) + + try: + if docs_init: + self.docsearch = FAISS.from_documents(docs_init, embeddings) + else: + self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True) + except Exception: + raise + self.assert_embedding_dimensions(embeddings) def search(self, *args, **kwargs): @@ -42,16 +39,12 @@ class FaissStore(BaseVectorStore): return self.docsearch.delete(*args, **kwargs) def assert_embedding_dimensions(self, embeddings): - """ - Check that the word embedding dimension of the docsearch index matches - the dimension of the word embeddings used - """ + """Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used.""" if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": - try: - word_embedding_dimension = embeddings.dimension - except AttributeError as e: - raise AttributeError("'dimension' attribute not found in embeddings instance. Make sure the embeddings object is properly initialized.") from e + word_embedding_dimension = getattr(embeddings, 'dimension', None) + if word_embedding_dimension is None: + raise AttributeError("'dimension' attribute not found in embeddings instance.") + docsearch_index_dimension = self.docsearch.index.d if word_embedding_dimension != docsearch_index_dimension: - raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " + - f"!= docsearch index dimension ({docsearch_index_dimension})") \ No newline at end of file + raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})") diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index 645703a2..7ba8add1 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -77,7 +77,7 @@ "remote": "Remote", "name": "Name", "choose": "Choose Files", - "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb", + "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb", "uploadedFiles": "Uploaded Files", "cancel": "Cancel", "train": "Train", diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index b898e4b6..cf56a21b 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -270,6 +270,7 @@ function Upload({ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], 'text/csv': ['.csv'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], }, }); diff --git a/mlb_teams_2012.csv:Zone.Identifier b/mlb_teams_2012.csv:Zone.Identifier new file mode 100644 index 00000000..1bf0b28e --- /dev/null +++ b/mlb_teams_2012.csv:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/test.py b/test.py new file mode 100644 index 00000000..59d75188 --- /dev/null +++ b/test.py @@ -0,0 +1,25 @@ +from pathlib import Path +from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser + +# Define the path to the .xlsx file +file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx") +parser = ExcelParser(concat_rows=True, pandas_config={}) + +# Initialize the ExcelParser +# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv") +# parser = PandasCSVParser(concat_rows=True, pandas_config={}) + + + +# Initialize the parser configuration (this can be customized if needed) +parser.init_parser() + +# Check if the parser config is set (this is optional) +if parser.parser_config_set: + print("Parser config has been set.") + +# Parse the Excel file +parsed_data = parser.parse_file(file_path) +print(parsed_data) + + From e9a7722915aefafd58d9077a8b6536c6150ce495 Mon Sep 17 00:00:00 2001 From: "devendra.parihar" Date: Tue, 1 Oct 2024 22:11:17 +0530 Subject: [PATCH 2/3] new: added ExcelParser(tested) to read .xlsx files --- test.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 59d75188..00000000 --- a/test.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser - -# Define the path to the .xlsx file -file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx") -parser = ExcelParser(concat_rows=True, pandas_config={}) - -# Initialize the ExcelParser -# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv") -# parser = PandasCSVParser(concat_rows=True, pandas_config={}) - - - -# Initialize the parser configuration (this can be customized if needed) -parser.init_parser() - -# Check if the parser config is set (this is optional) -if parser.parser_config_set: - print("Parser config has been set.") - -# Parse the Excel file -parsed_data = parser.parse_file(file_path) -print(parsed_data) - - From 09a15e2e59b119055ca31883b9da9b402c333c25 Mon Sep 17 00:00:00 2001 From: Devendra Parihar <54232149+Devparihar5@users.noreply.github.com> Date: Fri, 4 Oct 2024 22:59:40 +0530 Subject: [PATCH 3/3] Delete mlb_teams_2012.csv:Zone.Identifier --- mlb_teams_2012.csv:Zone.Identifier | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 mlb_teams_2012.csv:Zone.Identifier diff --git a/mlb_teams_2012.csv:Zone.Identifier b/mlb_teams_2012.csv:Zone.Identifier deleted file mode 100644 index 1bf0b28e..00000000 --- a/mlb_teams_2012.csv:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 -HostUrl=https://github.com/