From 05c835ed02c5d176c21d2d4ca9bea75aaf6c1c92 Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Wed, 31 Dec 2025 00:08:49 +0000
Subject: [PATCH] feat: enable OCR for docling when parsing attachments and
 update file extractor (#2246)

---
 application/core/settings.py    |  1 +
 application/parser/file/bulk.py |  9 ++++++---
 application/worker.py           | 10 ++++++++--
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/application/core/settings.py b/application/core/settings.py
index 3dde21aa..49f7720e 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -43,6 +43,7 @@ class Settings(BaseSettings):
     PARSE_PDF_AS_IMAGE: bool = False
     PARSE_IMAGE_REMOTE: bool = False
     DOCLING_OCR_ENABLED: bool = True  # Enable OCR for docling parsers (PDF, images)
+    DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False  # OCR for docling when parsing attachments; used instead of DOCLING_OCR_ENABLED in attachment_worker
     VECTOR_STORE: str = (
         "faiss"  #  "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
     )
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index dc12a4dd..64860c0c 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
 from application.core.settings import settings
 
 
-def get_default_file_extractor() -> Dict[str, BaseParser]:
+def get_default_file_extractor(
+    ocr_enabled: Optional[bool] = None,
+) -> Dict[str, BaseParser]:
     """Get the default file extractor.
 
     Uses docling parsers by default for advanced document processing.
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
             DoclingVTTParser,
             DoclingXMLParser,
         )
-        ocr_enabled = settings.DOCLING_OCR_ENABLED
+        if ocr_enabled is None:
+            ocr_enabled = settings.DOCLING_OCR_ENABLED
         return {
             # Documents
             ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
                     
             return result
         
-        return build_tree(Path(base_path))
\ No newline at end of file
+        return build_tree(Path(base_path))
diff --git a/application/worker.py b/application/worker.py
index 44668247..1fa39e3c 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -25,7 +25,7 @@ from application.core.settings import settings
 from application.parser.chunking import Chunker
 from application.parser.connectors.connector_creator import ConnectorCreator
 from application.parser.embedding_pipeline import embed_and_store_documents
-from application.parser.file.bulk import SimpleDirectoryReader
+from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor
 from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.schema.base import Document
 from application.retriever.retriever_creator import RetrieverCreator
@@ -1042,10 +1042,16 @@ def attachment_worker(self, file_info, user):
             state="PROGRESS", meta={"current": 30, "status": "Processing content"}
         )
 
+        file_extractor = get_default_file_extractor(
+            ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED
+        )
         content = storage.process_file(
             relative_path,
             lambda local_path, **kwargs: SimpleDirectoryReader(
-                input_files=[local_path], exclude_hidden=True, errors="ignore"
+                input_files=[local_path],
+                exclude_hidden=True,
+                errors="ignore",
+                file_extractor=file_extractor,
             )
             .load_data()[0]
             .text,