Merge pull request #1184 from Devparihar5/ExcelParser

new: added ExcelParser(tested) to read .xlsx files
This commit is contained in:
Alex
2024-10-06 23:19:37 +01:00
committed by GitHub
6 changed files with 71 additions and 3 deletions

View File

@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),

View File

@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
return (self._row_joiner).join(text_list)
else:
return text_list
class ExcelParser(BaseParser):
r"""Excel (.xlsx) parser.
Parses Excel files using Pandas `read_excel` function.
If special parameters are required, use the `pandas_config` dict.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
col_joiner (str): Separator to use for joining cols per row.
Set to ", " by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
pandas_config (dict): Options for the `pandas.read_excel` function call.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
for more information.
Set to empty dict by default, this means pandas will try to figure
out the table structure on its own.
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._col_joiner = col_joiner
self._row_joiner = row_joiner
self._pandas_config = pandas_config
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file."""
try:
import pandas as pd
except ImportError:
raise ValueError("pandas module is required to read Excel files.")
df = pd.read_excel(file, **self._pandas_config)
text_list = df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
if self._concat_rows:
return (self._row_joiner).join(text_list)
else:
return text_list

View File

@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
orjson==3.10.7
packaging==24.1
pandas==2.2.3
openpyxl==3.1.5
pathable==0.4.3
pillow==10.4.0
portalocker==2.10.1

View File

@@ -22,7 +22,7 @@ class FaissStore(BaseVectorStore):
else:
self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
except Exception:
raise # Just re-raise the exception without assigning to e
raise
self.assert_embedding_dimensions(embeddings)

View File

@@ -79,7 +79,7 @@
"remote": "Remote",
"name": "Name",
"choose": "Choose Files",
"info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
"uploadedFiles": "Uploaded Files",
"cancel": "Cancel",
"train": "Train",

View File

@@ -275,6 +275,7 @@ function Upload({
'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
['.docx'],
'text/csv': ['.csv'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
},
});