mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-02-03 04:44:10 +00:00
feat: implement Docling parsers (#2202)
* feat: implement Docling parsers * fix office * docling-ocr-fix * Docling smart ocr * ruff fix --------- Co-authored-by: Pavel <pabin@yandex.ru>
This commit is contained in:
@@ -48,7 +48,11 @@ FROM ubuntu:24.04 as final
|
||||
RUN apt-get update && \
|
||||
apt-get install -y software-properties-common && \
|
||||
add-apt-repository ppa:deadsnakes/ppa && \
|
||||
apt-get update && apt-get install -y --no-install-recommends python3.12 && \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3.12 \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
&& \
|
||||
ln -s /usr/bin/python3.12 /usr/bin/python && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@@ -76,7 +76,12 @@ class UploadFile(Resource):
|
||||
temp_file_path = os.path.join(temp_dir, safe_file)
|
||||
file.save(temp_file_path)
|
||||
|
||||
if zipfile.is_zipfile(temp_file_path):
|
||||
# Only extract actual .zip files, not Office formats (.docx, .xlsx, .pptx)
|
||||
# which are technically zip archives but should be processed as-is
|
||||
is_office_format = safe_file.lower().endswith(
|
||||
(".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub")
|
||||
)
|
||||
if zipfile.is_zipfile(temp_file_path) and not is_office_format:
|
||||
try:
|
||||
with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
|
||||
zip_ref.extractall(path=temp_dir)
|
||||
|
||||
@@ -42,6 +42,7 @@ class Settings(BaseSettings):
|
||||
UPLOAD_FOLDER: str = "inputs"
|
||||
PARSE_PDF_AS_IMAGE: bool = False
|
||||
PARSE_IMAGE_REMOTE: bool = False
|
||||
DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images)
|
||||
VECTOR_STORE: str = (
|
||||
"faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
|
||||
)
|
||||
|
||||
@@ -65,6 +65,10 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,
|
||||
if not os.path.exists(folder_name):
|
||||
os.makedirs(folder_name)
|
||||
|
||||
# Validate docs is not empty
|
||||
if not docs:
|
||||
raise ValueError("No documents to embed - check file format and extension")
|
||||
|
||||
# Initialize vector store
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
docs_init = [docs.pop(0)]
|
||||
|
||||
@@ -10,29 +10,94 @@ from application.parser.file.epub_parser import EpubParser
|
||||
from application.parser.file.html_parser import HTMLParser
|
||||
from application.parser.file.markdown_parser import MarkdownParser
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser
|
||||
from application.parser.file.json_parser import JSONParser
|
||||
from application.parser.file.pptx_parser import PPTXParser
|
||||
from application.parser.file.image_parser import ImageParser
|
||||
from application.parser.schema.base import Document
|
||||
from application.utils import num_tokens_from_string
|
||||
from application.core.settings import settings
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".pdf": PDFParser(),
|
||||
".docx": DocxParser(),
|
||||
".csv": PandasCSVParser(),
|
||||
".xlsx":ExcelParser(),
|
||||
".epub": EpubParser(),
|
||||
".md": MarkdownParser(),
|
||||
".rst": RstParser(),
|
||||
".html": HTMLParser(),
|
||||
".mdx": MarkdownParser(),
|
||||
".json":JSONParser(),
|
||||
".pptx":PPTXParser(),
|
||||
".png": ImageParser(),
|
||||
".jpg": ImageParser(),
|
||||
".jpeg": ImageParser(),
|
||||
}
|
||||
|
||||
def get_default_file_extractor() -> Dict[str, BaseParser]:
    """Build the default mapping from file extension to parser instance.

    Docling-based parsers are used when the ``docling`` package is available;
    otherwise the standard parsers are returned and a warning is logged.

    The availability check is done explicitly with ``importlib.util.find_spec``
    because ``application.parser.file.docling_parser`` only imports docling
    lazily (inside the converter factory). Importing that module therefore
    succeeds even when docling is missing, so relying on ``ImportError`` from
    the import statement alone would never trigger the fallback.

    Returns:
        Dict mapping lowercase extensions (including the leading dot) to
        parser instances.
    """
    import importlib.util

    try:
        docling_available = importlib.util.find_spec("docling") is not None
    except (ImportError, ValueError):
        # find_spec raises ValueError when a module object without a usable
        # __spec__ is already registered; treat that as "not available".
        docling_available = False

    if docling_available:
        try:
            from application.parser.file.docling_parser import (
                DoclingPDFParser,
                DoclingDocxParser,
                DoclingPPTXParser,
                DoclingXLSXParser,
                DoclingHTMLParser,
                DoclingImageParser,
                DoclingCSVParser,
                DoclingAsciiDocParser,
                DoclingVTTParser,
                DoclingXMLParser,
            )
        except ImportError:
            docling_available = False

    if not docling_available:
        logging.warning(
            "docling is not installed. Using standard parsers. "
            "For advanced document parsing, install with: pip install docling"
        )
        # Fallback to standard parsers
        return {
            ".pdf": PDFParser(),
            ".docx": DocxParser(),
            ".csv": PandasCSVParser(),
            ".xlsx": ExcelParser(),
            ".epub": EpubParser(),
            ".md": MarkdownParser(),
            ".rst": RstParser(),
            ".html": HTMLParser(),
            ".mdx": MarkdownParser(),
            ".json": JSONParser(),
            ".pptx": PPTXParser(),
            ".png": ImageParser(),
            ".jpg": ImageParser(),
            ".jpeg": ImageParser(),
        }

    ocr_enabled = settings.DOCLING_OCR_ENABLED
    image_extensions = (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp")

    extractors: Dict[str, BaseParser] = {
        # Documents
        ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
        ".docx": DoclingDocxParser(),
        ".pptx": DoclingPPTXParser(),
        ".xlsx": DoclingXLSXParser(),
        # Web formats
        ".html": DoclingHTMLParser(),
        ".xhtml": DoclingHTMLParser(),
        # Data formats
        ".csv": DoclingCSVParser(),
        ".json": JSONParser(),  # Keep JSON parser (specialized handling)
        # Text/markup formats
        ".md": MarkdownParser(),  # Keep markdown parser (specialized handling)
        ".mdx": MarkdownParser(),
        ".rst": RstParser(),
        ".adoc": DoclingAsciiDocParser(),
        ".asciidoc": DoclingAsciiDocParser(),
        # Media/subtitles
        ".vtt": DoclingVTTParser(),
        # Specialized XML formats
        ".xml": DoclingXMLParser(),
        # Formats docling doesn't support - use standard parsers
        ".epub": EpubParser(),
    }
    # Images (with OCR): one parser instance per extension, as before.
    extractors.update(
        {ext: DoclingImageParser(ocr_enabled=ocr_enabled) for ext in image_extensions}
    )
    return extractors
|
||||
|
||||
|
||||
# For backwards compatibility
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = get_default_file_extractor()
|
||||
|
||||
|
||||
class SimpleDirectoryReader(BaseReader):
|
||||
@@ -83,7 +148,10 @@ class SimpleDirectoryReader(BaseReader):
|
||||
|
||||
self.recursive = recursive
|
||||
self.exclude_hidden = exclude_hidden
|
||||
self.required_exts = required_exts
|
||||
# Normalize extensions to lowercase for case-insensitive matching
|
||||
self.required_exts = (
|
||||
[ext.lower() for ext in required_exts] if required_exts else None
|
||||
)
|
||||
self.num_files_limit = num_files_limit
|
||||
|
||||
if input_files:
|
||||
@@ -112,7 +180,7 @@ class SimpleDirectoryReader(BaseReader):
|
||||
continue
|
||||
elif (
|
||||
self.required_exts is not None
|
||||
and input_file.suffix not in self.required_exts
|
||||
and input_file.suffix.lower() not in self.required_exts
|
||||
):
|
||||
continue
|
||||
else:
|
||||
@@ -149,8 +217,9 @@ class SimpleDirectoryReader(BaseReader):
|
||||
self.file_token_counts = {}
|
||||
|
||||
for input_file in self.input_files:
|
||||
if input_file.suffix in self.file_extractor:
|
||||
parser = self.file_extractor[input_file.suffix]
|
||||
suffix_lower = input_file.suffix.lower()
|
||||
if suffix_lower in self.file_extractor:
|
||||
parser = self.file_extractor[suffix_lower]
|
||||
if not parser.parser_config_set:
|
||||
parser.init_parser()
|
||||
data = parser.parse_file(input_file, errors=self.errors)
|
||||
@@ -232,7 +301,7 @@ class SimpleDirectoryReader(BaseReader):
|
||||
if subtree:
|
||||
result[item.name] = subtree
|
||||
else:
|
||||
if self.required_exts is not None and item.suffix not in self.required_exts:
|
||||
if self.required_exts is not None and item.suffix.lower() not in self.required_exts:
|
||||
continue
|
||||
|
||||
full_path = str(item.resolve())
|
||||
|
||||
330
application/parser/file/docling_parser.py
Normal file
330
application/parser/file/docling_parser.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""Docling parser.
|
||||
|
||||
Uses docling library for advanced document parsing with layout detection,
|
||||
table structure recognition, and unified document representation.
|
||||
|
||||
Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc,
|
||||
images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats.
|
||||
"""
|
||||
import importlib.util
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DoclingParser(BaseParser):
    """Parser using docling for advanced document processing.

    Docling provides:
    - Advanced PDF layout analysis
    - Table structure recognition
    - Reading order detection
    - OCR for scanned documents (supports RapidOCR)
    - Unified DoclingDocument format
    - Export to Markdown

    Uses hybrid OCR approach by default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)
    """

    # A stripped export shorter than this is considered "minimal" and triggers
    # the direct-text fallback in ``_export_content``. The bare
    # "<!-- image -->" placeholder is well below this limit.
    _MIN_EXPORT_CHARS = 50

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        export_format: str = "markdown",
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ):
        """Initialize DoclingParser.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions in documents
            table_structure: Enable table structure recognition
            export_format: Output format ('markdown', 'html'); any other
                value falls back to plain-text export
            use_rapidocr: Use RapidOCR engine (default True, works well in Docker)
            ocr_languages: List of OCR languages (default: ['english'])
            force_full_page_ocr: Force OCR on entire page (False = smart hybrid OCR)
        """
        super().__init__()
        self.ocr_enabled = ocr_enabled
        self.table_structure = table_structure
        self.export_format = export_format
        self.use_rapidocr = use_rapidocr
        self.ocr_languages = ocr_languages or ["english"]
        self.force_full_page_ocr = force_full_page_ocr
        # Built lazily by _init_parser so that constructing a parser never
        # requires docling to be importable.
        self._converter = None

    def _create_converter(self):
        """Create a docling converter with hybrid OCR configuration.

        Uses smart OCR approach:
        - When ocr_enabled=True and force_full_page_ocr=False (default):
          Layout model detects text vs bitmap regions, OCR only runs on bitmaps
        - When ocr_enabled=True and force_full_page_ocr=True:
          OCR runs on entire page (for scanned documents/images)
        - When ocr_enabled=False:
          No OCR, only native text extraction

        The same pipeline options object is used for both the PDF and the
        image input formats.

        Returns:
            DocumentConverter instance
        """
        from docling.document_converter import (
            DocumentConverter,
            ImageFormatOption,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions

        pipeline_options = PdfPipelineOptions(
            do_ocr=self.ocr_enabled,
            do_table_structure=self.table_structure,
        )

        if self.ocr_enabled:
            ocr_options = self._get_ocr_options()
            if ocr_options is not None:
                pipeline_options.ocr_options = ocr_options
            elif self.force_full_page_ocr:
                # RapidOCR is disabled or unavailable, so docling's default OCR
                # options stay in place. Still honour the full-page request
                # instead of silently dropping it.
                default_ocr = getattr(pipeline_options, "ocr_options", None)
                if default_ocr is not None and hasattr(
                    default_ocr, "force_full_page_ocr"
                ):
                    default_ocr.force_full_page_ocr = True

        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
                InputFormat.IMAGE: ImageFormatOption(
                    pipeline_options=pipeline_options,
                ),
            }
        )

    def _init_parser(self) -> Dict:
        """Initialize the docling converter with hybrid OCR.

        Returns:
            Dict describing the configuration this parser was built with.

        Raises:
            ImportError: If docling is not installed.
        """
        logger.info("Initializing DoclingParser...")
        logger.info("  ocr_enabled=%s", self.ocr_enabled)
        logger.info("  force_full_page_ocr=%s", self.force_full_page_ocr)
        logger.info("  use_rapidocr=%s", self.use_rapidocr)

        # find_spec on a dotted name imports the parent package first and
        # raises ModuleNotFoundError when that parent is missing, so the
        # "is None" check alone would never produce the install hint below.
        try:
            converter_spec = importlib.util.find_spec("docling.document_converter")
        except ModuleNotFoundError:
            converter_spec = None

        if converter_spec is None:
            raise ImportError(
                "docling is required for DoclingParser. "
                "Install it with: pip install docling"
            )

        # Create converter with hybrid OCR (smart: text direct, bitmaps OCR'd)
        self._converter = self._create_converter()

        logger.info("DoclingParser initialized successfully")
        return {
            "ocr_enabled": self.ocr_enabled,
            "table_structure": self.table_structure,
            "export_format": self.export_format,
            "use_rapidocr": self.use_rapidocr,
            "ocr_languages": self.ocr_languages,
            "force_full_page_ocr": self.force_full_page_ocr,
        }

    def _get_ocr_options(self):
        """Get OCR options based on configuration.

        Returns RapidOcrOptions if use_rapidocr is True and available,
        otherwise returns None to use docling defaults. Failures while
        importing or constructing the options are logged and also yield None
        (best-effort: OCR configuration problems must not abort parsing).
        """
        if not self.use_rapidocr:
            return None

        try:
            from docling.datamodel.pipeline_options import RapidOcrOptions

            return RapidOcrOptions(
                lang=self.ocr_languages,
                force_full_page_ocr=self.force_full_page_ocr,
            )
        except ImportError as e:
            logger.warning("Failed to import RapidOcrOptions: %s", e)
            return None
        except Exception as e:
            logger.error("Error creating RapidOcrOptions: %s", e)
            return None

    def _export_content(self, document) -> str:
        """Export document content in the configured format.

        Handles edge case where text is nested under picture elements (e.g., OCR'd
        images). If the standard export returns minimal content but document.texts
        contains extracted text, falls back to direct text extraction.

        Args:
            document: The converted docling document.

        Returns:
            The exported content, or the non-empty entries of
            ``document.texts`` joined by blank lines when the standard export
            is shorter than ``_MIN_EXPORT_CHARS`` characters.
        """
        if self.export_format == "markdown":
            content = document.export_to_markdown()
        elif self.export_format == "html":
            content = document.export_to_html()
        else:
            content = document.export_to_text()

        # Handle case where text is nested under pictures (common with OCR'd images)
        # Standard exports may return just "<!-- image -->" while actual text exists
        stripped_content = content.strip()
        if len(stripped_content) >= self._MIN_EXPORT_CHARS:
            return content

        text_items = getattr(document, "texts", None)
        if text_items:
            # Extract text directly from document.texts
            extracted_texts = [item.text for item in text_items if item.text]
            if extracted_texts:
                logger.info(
                    "Standard export minimal (%d chars), extracting %d texts directly",
                    len(stripped_content),
                    len(extracted_texts),
                )
                return "\n\n".join(extracted_texts)

        return content

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file using docling with hybrid OCR.

        Uses smart OCR approach where the layout model detects text vs bitmap
        regions. Text is extracted directly, bitmaps are OCR'd only when needed.

        Args:
            file: Path to the file to parse
            errors: Error handling mode. With "ignore" (the default) a
                conversion failure is logged and a bracketed error message is
                returned in place of the content; any other value re-raises
                the original exception.

        Returns:
            Parsed document content in the configured export format
            (a single string).

        Raises:
            ImportError: If docling is not installed (raised while lazily
                creating the converter, regardless of ``errors``).
            Exception: Any conversion error, when ``errors`` is not "ignore".
        """
        logger.info("parse_file called for: %s", file)

        if self._converter is None:
            self._init_parser()

        try:
            logger.info("Converting file with hybrid OCR: %s", file)
            result = self._converter.convert(str(file))
            content = self._export_content(result.document)
            logger.info("Parse complete, content length: %d chars", len(content))

            return content

        except Exception as e:
            logger.error("Error parsing file with docling: %s", e, exc_info=True)
            if errors == "ignore":
                # NOTE(review): this placeholder becomes the document text for
                # the caller; confirm downstream indexing is meant to store it.
                return f"[Error parsing file with docling: {str(e)}]"
            raise
|
||||
|
||||
|
||||
class DoclingPDFParser(DoclingParser):
    """PDF parser built on DoclingParser; output is always Markdown.

    With the default arguments OCR is enabled in hybrid mode
    (``force_full_page_ocr=False``): natively embedded text is read directly
    and only bitmap regions are sent through OCR.

    Pass ``force_full_page_ocr=True`` only for fully scanned documents.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ) -> None:
        """Forward the OCR/table settings to the base parser.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions.
            table_structure: Enable table structure recognition.
            use_rapidocr: Prefer the RapidOCR engine.
            ocr_languages: OCR languages; ``None`` lets the base class pick
                its default.
            force_full_page_ocr: Run OCR over whole pages.
        """
        base_options = {
            "ocr_enabled": ocr_enabled,
            "table_structure": table_structure,
            "export_format": "markdown",
            "use_rapidocr": use_rapidocr,
            "ocr_languages": ocr_languages,
            "force_full_page_ocr": force_full_page_ocr,
        }
        super().__init__(**base_options)
|
||||
|
||||
|
||||
class DoclingDocxParser(DoclingParser):
    """DOCX parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingPPTXParser(DoclingParser):
    """PPTX parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingXLSXParser(DoclingParser):
    """XLSX parser built on DoclingParser; table structure on, Markdown out."""

    def __init__(self) -> None:
        """Request table structure recognition and Markdown export explicitly."""
        base_options = {"table_structure": True, "export_format": "markdown"}
        super().__init__(**base_options)
|
||||
|
||||
|
||||
class DoclingHTMLParser(DoclingParser):
    """HTML parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingImageParser(DoclingParser):
    """Image parser built on DoclingParser; output is always Markdown.

    Unlike the base class, ``force_full_page_ocr`` defaults to True here:
    an image has no embedded text layer, so any text must come from running
    OCR over the whole picture.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = True,
    ) -> None:
        """Forward the OCR settings to the base parser.

        Args:
            ocr_enabled: Enable OCR.
            use_rapidocr: Prefer the RapidOCR engine.
            ocr_languages: OCR languages; ``None`` lets the base class pick
                its default.
            force_full_page_ocr: Run OCR over the whole image.
        """
        base_options = {
            "ocr_enabled": ocr_enabled,
            "export_format": "markdown",
            "use_rapidocr": use_rapidocr,
            "ocr_languages": ocr_languages,
            "force_full_page_ocr": force_full_page_ocr,
        }
        super().__init__(**base_options)
|
||||
|
||||
|
||||
class DoclingCSVParser(DoclingParser):
    """CSV parser built on DoclingParser; table structure on, Markdown out."""

    def __init__(self) -> None:
        """Request table structure recognition and Markdown export explicitly."""
        base_options = {"table_structure": True, "export_format": "markdown"}
        super().__init__(**base_options)
|
||||
|
||||
|
||||
class DoclingMarkdownParser(DoclingParser):
    """Markdown parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingAsciiDocParser(DoclingParser):
    """AsciiDoc parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingVTTParser(DoclingParser):
    """WebVTT (video text tracks) parser built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
|
||||
|
||||
class DoclingXMLParser(DoclingParser):
    """XML parser (USPTO, JATS) built on DoclingParser; exports Markdown."""

    def __init__(self) -> None:
        """Configure the base parser for Markdown export, other options default."""
        markdown = "markdown"
        super().__init__(export_format=markdown)
|
||||
@@ -5,6 +5,8 @@ celery==5.6.0
|
||||
cryptography==46.0.3
|
||||
dataclasses-json==0.6.7
|
||||
docling>=2.16.0
|
||||
rapidocr>=1.4.0
|
||||
onnxruntime>=1.19.0
|
||||
docx2txt==0.8
|
||||
duckduckgo-search==8.1.1
|
||||
ebooklib==0.20
|
||||
|
||||
Reference in New Issue
Block a user