feat: enable OCR for docling when parsing attachments and update file extractor (#2246)

This commit is contained in:
Alex
2025-12-31 00:08:49 +00:00
committed by GitHub
parent 9e7f1ad1c0
commit 05c835ed02
3 changed files with 15 additions and 5 deletions

View File

@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
from application.core.settings import settings
def get_default_file_extractor() -> Dict[str, BaseParser]:
def get_default_file_extractor(
ocr_enabled: Optional[bool] = None,
) -> Dict[str, BaseParser]:
"""Get the default file extractor.
Uses docling parsers by default for advanced document processing.
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
DoclingVTTParser,
DoclingXMLParser,
)
ocr_enabled = settings.DOCLING_OCR_ENABLED
if ocr_enabled is None:
ocr_enabled = settings.DOCLING_OCR_ENABLED
return {
# Documents
".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
return result
return build_tree(Path(base_path))
return build_tree(Path(base_path))