feat: enable OCR for docling when parsing attachments and update file extractor (#2246)

2026-03-07 22:33:36 +00:00 · 2025-12-31 00:08:49 +00:00
parent 9e7f1ad1c0
commit 05c835ed02
3 changed files with 15 additions and 5 deletions
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -43,6 +43,7 @@ class Settings(BaseSettings):
    PARSE_PDF_AS_IMAGE: bool = False
    PARSE_IMAGE_REMOTE: bool = False
    DOCLING_OCR_ENABLED: bool = True  # Enable OCR for docling parsers (PDF, images)
+    DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False  # Enable docling OCR when parsing attachments (used instead of DOCLING_OCR_ENABLED on that path)
    VECTOR_STORE: str = (
        "faiss"  #  "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
    )
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
 from application.core.settings import settings


-def get_default_file_extractor() -> Dict[str, BaseParser]:
+def get_default_file_extractor(
+    ocr_enabled: Optional[bool] = None,
+) -> Dict[str, BaseParser]:
    """Get the default file extractor.

    Uses docling parsers by default for advanced document processing.
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
            DoclingVTTParser,
            DoclingXMLParser,
        )
-        ocr_enabled = settings.DOCLING_OCR_ENABLED
+        if ocr_enabled is None:
+            ocr_enabled = settings.DOCLING_OCR_ENABLED
        return {
            # Documents
            ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
                    
            return result
        
-        return build_tree(Path(base_path))
+        return build_tree(Path(base_path))
--- a/application/worker.py
+++ b/application/worker.py
@@ -25,7 +25,7 @@ from application.core.settings import settings
 from application.parser.chunking import Chunker
 from application.parser.connectors.connector_creator import ConnectorCreator
 from application.parser.embedding_pipeline import embed_and_store_documents
-from application.parser.file.bulk import SimpleDirectoryReader
+from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor
 from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.schema.base import Document
 from application.retriever.retriever_creator import RetrieverCreator
@@ -1042,10 +1042,16 @@ def attachment_worker(self, file_info, user):
            state="PROGRESS", meta={"current": 30, "status": "Processing content"}
        )

+        file_extractor = get_default_file_extractor(
+            ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED
+        )
        content = storage.process_file(
            relative_path,
            lambda local_path, **kwargs: SimpleDirectoryReader(
-                input_files=[local_path], exclude_hidden=True, errors="ignore"
+                input_files=[local_path],
+                exclude_hidden=True,
+                errors="ignore",
+                file_extractor=file_extractor,
            )
            .load_data()[0]
            .text,