new: added ExcelParser(tested) to read .xlsx files

This commit is contained in:
devendra.parihar
2024-10-01 22:03:10 +05:30
parent af1b81097f
commit 7794129929
8 changed files with 116 additions and 27 deletions

View File

@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),

View File

@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
return (self._row_joiner).join(text_list)
else:
return text_list
class ExcelParser(BaseParser):
r"""Excel (.xlsx) parser.
Parses Excel files using Pandas `read_excel` function.
If special parameters are required, use the `pandas_config` dict.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
col_joiner (str): Separator to use for joining cols per row.
Set to ", " by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
pandas_config (dict): Options for the `pandas.read_excel` function call.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
for more information.
Set to empty dict by default, this means pandas will try to figure
out the table structure on its own.
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._col_joiner = col_joiner
self._row_joiner = row_joiner
self._pandas_config = pandas_config
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file."""
try:
import pandas as pd
except ImportError:
raise ValueError("pandas module is required to read Excel files.")
df = pd.read_excel(file, **self._pandas_config)
text_list = df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
if self._concat_rows:
return (self._row_joiner).join(text_list)
else:
return text_list

View File

@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
orjson==3.10.7
packaging==24.1
pandas==2.2.3
openpyxl==3.1.5
pathable==0.4.3
pillow==10.4.0
portalocker==2.10.1

View File

@@ -3,30 +3,27 @@ from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
import os
def get_vectorstore(path):
def get_vectorstore(path: str) -> str:
if path:
vectorstore = "indexes/"+path
vectorstore = os.path.join("application", vectorstore)
vectorstore = os.path.join("application", "indexes", path)
else:
vectorstore = os.path.join("application")
return vectorstore
class FaissStore(BaseVectorStore):
def __init__(self, source_id, embeddings_key, docs_init=None):
def __init__(self, source_id: str, embeddings_key: str, docs_init=None):
super().__init__()
self.path = get_vectorstore(source_id)
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
if docs_init:
self.docsearch = FAISS.from_documents(
docs_init, embeddings
)
else:
self.docsearch = FAISS.load_local(
self.path, embeddings,
allow_dangerous_deserialization=True
)
try:
if docs_init:
self.docsearch = FAISS.from_documents(docs_init, embeddings)
else:
self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
except Exception:
raise
self.assert_embedding_dimensions(embeddings)
def search(self, *args, **kwargs):
@@ -42,16 +39,12 @@ class FaissStore(BaseVectorStore):
return self.docsearch.delete(*args, **kwargs)
def assert_embedding_dimensions(self, embeddings):
"""
Check that the word embedding dimension of the docsearch index matches
the dimension of the word embeddings used
"""
"""Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used."""
if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
try:
word_embedding_dimension = embeddings.dimension
except AttributeError as e:
raise AttributeError("'dimension' attribute not found in embeddings instance. Make sure the embeddings object is properly initialized.") from e
word_embedding_dimension = getattr(embeddings, 'dimension', None)
if word_embedding_dimension is None:
raise AttributeError("'dimension' attribute not found in embeddings instance.")
docsearch_index_dimension = self.docsearch.index.d
if word_embedding_dimension != docsearch_index_dimension:
raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " +
f"!= docsearch index dimension ({docsearch_index_dimension})")
raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})")

View File

@@ -77,7 +77,7 @@
"remote": "Remote",
"name": "Name",
"choose": "Choose Files",
"info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
"uploadedFiles": "Uploaded Files",
"cancel": "Cancel",
"train": "Train",

View File

@@ -270,6 +270,7 @@ function Upload({
'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
['.docx'],
'text/csv': ['.csv'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
},
});

View File

@@ -0,0 +1,3 @@
[ZoneTransfer]
ZoneId=3
HostUrl=https://github.com/

25
test.py Normal file
View File

@@ -0,0 +1,25 @@
from pathlib import Path
from application.parser.file.tabular_parser import ExcelParser,PandasCSVParser
# Define the path to the .xlsx file
file_path = Path("/home/dev523/DocsGPT/Ledgers in Default Template.xlsx")
parser = ExcelParser(concat_rows=True, pandas_config={})
# Initialize the ExcelParser
# file_path = Path("/home/dev523/DocsGPT/mlb_teams_2012.csv")
# parser = PandasCSVParser(concat_rows=True, pandas_config={})
# Initialize the parser configuration (this can be customized if needed)
parser.init_parser()
# Check if the parser config is set (this is optional)
if parser.parser_config_set:
print("Parser config has been set.")
# Parse the Excel file
parsed_data = parser.parse_file(file_path)
print(parsed_data)