mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-01-20 14:00:55 +00:00
feat: enable OCR for docling when parsing attachments and update file extractor (#2246)
This commit is contained in:
@@ -43,6 +43,7 @@ class Settings(BaseSettings):
|
||||
PARSE_PDF_AS_IMAGE: bool = False
|
||||
PARSE_IMAGE_REMOTE: bool = False
|
||||
DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images)
|
||||
DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False # Enable OCR for docling when parsing attachments
|
||||
VECTOR_STORE: str = (
|
||||
"faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
|
||||
)
|
||||
|
||||
@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
|
||||
from application.core.settings import settings
|
||||
|
||||
|
||||
def get_default_file_extractor() -> Dict[str, BaseParser]:
|
||||
def get_default_file_extractor(
|
||||
ocr_enabled: Optional[bool] = None,
|
||||
) -> Dict[str, BaseParser]:
|
||||
"""Get the default file extractor.
|
||||
|
||||
Uses docling parsers by default for advanced document processing.
|
||||
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
|
||||
DoclingVTTParser,
|
||||
DoclingXMLParser,
|
||||
)
|
||||
ocr_enabled = settings.DOCLING_OCR_ENABLED
|
||||
if ocr_enabled is None:
|
||||
ocr_enabled = settings.DOCLING_OCR_ENABLED
|
||||
return {
|
||||
# Documents
|
||||
".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
|
||||
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
|
||||
|
||||
return result
|
||||
|
||||
return build_tree(Path(base_path))
|
||||
return build_tree(Path(base_path))
|
||||
|
||||
@@ -25,7 +25,7 @@ from application.core.settings import settings
|
||||
from application.parser.chunking import Chunker
|
||||
from application.parser.connectors.connector_creator import ConnectorCreator
|
||||
from application.parser.embedding_pipeline import embed_and_store_documents
|
||||
from application.parser.file.bulk import SimpleDirectoryReader
|
||||
from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor
|
||||
from application.parser.remote.remote_creator import RemoteCreator
|
||||
from application.parser.schema.base import Document
|
||||
from application.retriever.retriever_creator import RetrieverCreator
|
||||
@@ -1042,10 +1042,16 @@ def attachment_worker(self, file_info, user):
|
||||
state="PROGRESS", meta={"current": 30, "status": "Processing content"}
|
||||
)
|
||||
|
||||
file_extractor = get_default_file_extractor(
|
||||
ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED
|
||||
)
|
||||
content = storage.process_file(
|
||||
relative_path,
|
||||
lambda local_path, **kwargs: SimpleDirectoryReader(
|
||||
input_files=[local_path], exclude_hidden=True, errors="ignore"
|
||||
input_files=[local_path],
|
||||
exclude_hidden=True,
|
||||
errors="ignore",
|
||||
file_extractor=file_extractor,
|
||||
)
|
||||
.load_data()[0]
|
||||
.text,
|
||||
|
||||
Reference in New Issue
Block a user