feat: process PDFs synthetically if model does not support files natively (#2263)

* feat: process PDFs synthetically if model does not support files natively

* fix: small code optimisations
This commit is contained in:
Alex
2026-01-15 02:30:33 +02:00
committed by GitHub
parent 2c55c6cd9a
commit f61d112cea
13 changed files with 449 additions and 72 deletions

View File

@@ -60,14 +60,14 @@ def get_default_file_extractor(
".rst": RstParser(),
".adoc": DoclingAsciiDocParser(),
".asciidoc": DoclingAsciiDocParser(),
# Images (with OCR)
".png": DoclingImageParser(ocr_enabled=ocr_enabled),
".jpg": DoclingImageParser(ocr_enabled=ocr_enabled),
".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled),
".tiff": DoclingImageParser(ocr_enabled=ocr_enabled),
".tif": DoclingImageParser(ocr_enabled=ocr_enabled),
".bmp": DoclingImageParser(ocr_enabled=ocr_enabled),
".webp": DoclingImageParser(ocr_enabled=ocr_enabled),
# Images (with OCR) - only use Docling when OCR is enabled
".png": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".jpg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".tiff": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".tif": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".bmp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
".webp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
# Media/subtitles
".vtt": DoclingVTTParser(),
# Specialized XML formats