feat: enable OCR for docling when parsing attachments and update file extractor (#2246)

2026-02-13 09:40:48 +00:00 · 2025-12-31 00:08:49 +00:00
parent 9e7f1ad1c0
commit 05c835ed02
3 changed files with 15 additions and 5 deletions
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
 from application.core.settings import settings


-def get_default_file_extractor() -> Dict[str, BaseParser]:
+def get_default_file_extractor(
+    ocr_enabled: Optional[bool] = None,
+) -> Dict[str, BaseParser]:
    """Get the default file extractor.

    Uses docling parsers by default for advanced document processing.
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
            DoclingVTTParser,
            DoclingXMLParser,
        )
-        ocr_enabled = settings.DOCLING_OCR_ENABLED
+        if ocr_enabled is None:
+            ocr_enabled = settings.DOCLING_OCR_ENABLED
        return {
            # Documents
            ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
                    
            return result
        
-        return build_tree(Path(base_path))
+        return build_tree(Path(base_path))