mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-02-13 09:40:48 +00:00
feat: enable OCR for docling when parsing attachments and update file extractor (#2246)
This commit is contained in:
@@ -19,7 +19,9 @@ from application.utils import num_tokens_from_string
|
||||
from application.core.settings import settings
|
||||
|
||||
|
||||
def get_default_file_extractor() -> Dict[str, BaseParser]:
|
||||
def get_default_file_extractor(
|
||||
ocr_enabled: Optional[bool] = None,
|
||||
) -> Dict[str, BaseParser]:
|
||||
"""Get the default file extractor.
|
||||
|
||||
Uses docling parsers by default for advanced document processing.
|
||||
@@ -38,7 +40,8 @@ def get_default_file_extractor() -> Dict[str, BaseParser]:
|
||||
DoclingVTTParser,
|
||||
DoclingXMLParser,
|
||||
)
|
||||
ocr_enabled = settings.DOCLING_OCR_ENABLED
|
||||
if ocr_enabled is None:
|
||||
ocr_enabled = settings.DOCLING_OCR_ENABLED
|
||||
return {
|
||||
# Documents
|
||||
".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
|
||||
@@ -320,4 +323,4 @@ class SimpleDirectoryReader(BaseReader):
|
||||
|
||||
return result
|
||||
|
||||
return build_tree(Path(base_path))
|
||||
return build_tree(Path(base_path))
|
||||
|
||||
Reference in New Issue
Block a user