feat: enable OCR for docling when parsing attachments and update file extractor (#2246)

This commit is contained in:
Alex
2025-12-31 00:08:49 +00:00
committed by GitHub
parent 9e7f1ad1c0
commit 05c835ed02
3 changed files with 15 additions and 5 deletions

View File

@@ -43,6 +43,7 @@ class Settings(BaseSettings):
PARSE_PDF_AS_IMAGE: bool = False
PARSE_IMAGE_REMOTE: bool = False
DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images)
DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False # Enable OCR for docling when parsing attachments
VECTOR_STORE: str = (
"faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
)

View File

@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
from application.core.settings import settings
def get_default_file_extractor() -> Dict[str, BaseParser]:
def get_default_file_extractor(
ocr_enabled: Optional[bool] = None,
) -> Dict[str, BaseParser]:
"""Get the default file extractor.
Uses docling parsers by default for advanced document processing.
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
DoclingVTTParser,
DoclingXMLParser,
)
ocr_enabled = settings.DOCLING_OCR_ENABLED
if ocr_enabled is None:
ocr_enabled = settings.DOCLING_OCR_ENABLED
return {
# Documents
".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
return result
return build_tree(Path(base_path))
return build_tree(Path(base_path))

View File

@@ -25,7 +25,7 @@ from application.core.settings import settings
from application.parser.chunking import Chunker
from application.parser.connectors.connector_creator import ConnectorCreator
from application.parser.embedding_pipeline import embed_and_store_documents
from application.parser.file.bulk import SimpleDirectoryReader
from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor
from application.parser.remote.remote_creator import RemoteCreator
from application.parser.schema.base import Document
from application.retriever.retriever_creator import RetrieverCreator
@@ -1042,10 +1042,16 @@ def attachment_worker(self, file_info, user):
state="PROGRESS", meta={"current": 30, "status": "Processing content"}
)
file_extractor = get_default_file_extractor(
ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED
)
content = storage.process_file(
relative_path,
lambda local_path, **kwargs: SimpleDirectoryReader(
input_files=[local_path], exclude_hidden=True, errors="ignore"
input_files=[local_path],
exclude_hidden=True,
errors="ignore",
file_extractor=file_extractor,
)
.load_data()[0]
.text,