mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
Merge pull request #1184 from Devparihar5/ExcelParser
new: added ExcelParser(tested) to read .xlsx files
This commit is contained in:
@@ -10,13 +10,14 @@ from application.parser.file.epub_parser import EpubParser
|
||||
from application.parser.file.html_parser import HTMLParser
|
||||
from application.parser.file.markdown_parser import MarkdownParser
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".pdf": PDFParser(),
|
||||
".docx": DocxParser(),
|
||||
".csv": PandasCSVParser(),
|
||||
".xlsx":ExcelParser(),
|
||||
".epub": EpubParser(),
|
||||
".md": MarkdownParser(),
|
||||
".rst": RstParser(),
|
||||
|
||||
@@ -113,3 +113,68 @@ class PandasCSVParser(BaseParser):
|
||||
return (self._row_joiner).join(text_list)
|
||||
else:
|
||||
return text_list
|
||||
|
||||
|
||||
class ExcelParser(BaseParser):
|
||||
r"""Excel (.xlsx) parser.
|
||||
|
||||
Parses Excel files using Pandas `read_excel` function.
|
||||
If special parameters are required, use the `pandas_config` dict.
|
||||
|
||||
Args:
|
||||
concat_rows (bool): whether to concatenate all rows into one document.
|
||||
If set to False, a Document will be created for each row.
|
||||
True by default.
|
||||
|
||||
col_joiner (str): Separator to use for joining cols per row.
|
||||
Set to ", " by default.
|
||||
|
||||
row_joiner (str): Separator to use for joining each row.
|
||||
Only used when `concat_rows=True`.
|
||||
Set to "\n" by default.
|
||||
|
||||
pandas_config (dict): Options for the `pandas.read_excel` function call.
|
||||
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
|
||||
for more information.
|
||||
Set to empty dict by default, this means pandas will try to figure
|
||||
out the table structure on its own.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
concat_rows: bool = True,
|
||||
col_joiner: str = ", ",
|
||||
row_joiner: str = "\n",
|
||||
pandas_config: dict = {},
|
||||
**kwargs: Any
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
self._concat_rows = concat_rows
|
||||
self._col_joiner = col_joiner
|
||||
self._row_joiner = row_joiner
|
||||
self._pandas_config = pandas_config
|
||||
|
||||
def _init_parser(self) -> Dict:
|
||||
"""Init parser."""
|
||||
return {}
|
||||
|
||||
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
|
||||
"""Parse file."""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ValueError("pandas module is required to read Excel files.")
|
||||
|
||||
df = pd.read_excel(file, **self._pandas_config)
|
||||
|
||||
text_list = df.apply(
|
||||
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
|
||||
).tolist()
|
||||
|
||||
if self._concat_rows:
|
||||
return (self._row_joiner).join(text_list)
|
||||
else:
|
||||
return text_list
|
||||
@@ -49,6 +49,7 @@ openapi3-parser==1.1.18
|
||||
orjson==3.10.7
|
||||
packaging==24.1
|
||||
pandas==2.2.3
|
||||
openpyxl==3.1.5
|
||||
pathable==0.4.3
|
||||
pillow==10.4.0
|
||||
portalocker==2.10.1
|
||||
|
||||
@@ -22,7 +22,7 @@ class FaissStore(BaseVectorStore):
|
||||
else:
|
||||
self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
|
||||
except Exception:
|
||||
raise # Just re-raise the exception without assigning to e
|
||||
raise
|
||||
|
||||
self.assert_embedding_dimensions(embeddings)
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@
|
||||
"remote": "Remote",
|
||||
"name": "Name",
|
||||
"choose": "Choose Files",
|
||||
"info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
|
||||
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
|
||||
"uploadedFiles": "Uploaded Files",
|
||||
"cancel": "Cancel",
|
||||
"train": "Train",
|
||||
|
||||
@@ -275,6 +275,7 @@ function Upload({
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
|
||||
['.docx'],
|
||||
'text/csv': ['.csv'],
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user