From 05c835ed02c5d176c21d2d4ca9bea75aaf6c1c92 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 31 Dec 2025 00:08:49 +0000 Subject: [PATCH] feat: enable OCR for docling when parsing attachments and update file extractor (#2246) --- application/core/settings.py | 1 + application/parser/file/bulk.py | 9 ++++++--- application/worker.py | 10 ++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/application/core/settings.py b/application/core/settings.py index 3dde21aa..49f7720e 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -43,6 +43,7 @@ class Settings(BaseSettings): PARSE_PDF_AS_IMAGE: bool = False PARSE_IMAGE_REMOTE: bool = False DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images) + DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False # Enable OCR for docling when parsing attachments VECTOR_STORE: str = ( "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector" ) diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index dc12a4dd..64860c0c 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string from application.core.settings import settings -def get_default_file_extractor() -> Dict[str, BaseParser]: +def get_default_file_extractor( + ocr_enabled: Optional[bool] = None, +) -> Dict[str, BaseParser]: """Get the default file extractor. Uses docling parsers by default for advanced document processing. @@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]: DoclingVTTParser, DoclingXMLParser, ) - ocr_enabled = settings.DOCLING_OCR_ENABLED + if ocr_enabled is None: + ocr_enabled = settings.DOCLING_OCR_ENABLED return { # Documents ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled), @@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader): return result - return build_tree(Path(base_path)) \ No newline at end of file + return build_tree(Path(base_path)) diff --git a/application/worker.py b/application/worker.py index 44668247..1fa39e3c 100755 --- a/application/worker.py +++ b/application/worker.py @@ -25,7 +25,7 @@ from application.core.settings import settings from application.parser.chunking import Chunker from application.parser.connectors.connector_creator import ConnectorCreator from application.parser.embedding_pipeline import embed_and_store_documents -from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document from application.retriever.retriever_creator import RetrieverCreator @@ -1042,10 +1042,16 @@ def attachment_worker(self, file_info, user): state="PROGRESS", meta={"current": 30, "status": "Processing content"} ) + file_extractor = get_default_file_extractor( + ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED + ) content = storage.process_file( relative_path, lambda local_path, **kwargs: SimpleDirectoryReader( - input_files=[local_path], exclude_hidden=True, errors="ignore" + input_files=[local_path], + exclude_hidden=True, + errors="ignore", + file_extractor=file_extractor, ) .load_data()[0] .text,