feat: implement Docling parsers (#2202)

* feat: implement Docling parsers

* fix office

* docling-ocr-fix

* Docling smart ocr

* ruff fix

---------

Co-authored-by: Pavel <pabin@yandex.ru>
This commit is contained in:
Alex
2025-12-23 16:33:51 +00:00
committed by GitHub
parent 5b6cfa6ecc
commit ccd29b7d4e
7 changed files with 439 additions and 24 deletions

View File

@@ -76,7 +76,12 @@ class UploadFile(Resource):
temp_file_path = os.path.join(temp_dir, safe_file)
file.save(temp_file_path)
if zipfile.is_zipfile(temp_file_path):
# Only extract actual .zip uploads. Office, OpenDocument and EPUB files (.docx, .xlsx, .pptx,
# .odt, .ods, .odp, .epub) are also zip containers; they are matched by file extension and kept as-is
is_office_format = safe_file.lower().endswith(
(".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub")
)
if zipfile.is_zipfile(temp_file_path) and not is_office_format:
try:
with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
zip_ref.extractall(path=temp_dir)