mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-01-20 05:50:58 +00:00
* feat: implement Docling parsers * fix office * docling-ocr-fix * Docling smart ocr * ruff fix --------- Co-authored-by: Pavel <pabin@yandex.ru>
331 lines
11 KiB
Python
331 lines
11 KiB
Python
"""Docling parser.
|
|
|
|
Uses docling library for advanced document parsing with layout detection,
|
|
table structure recognition, and unified document representation.
|
|
|
|
Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc,
|
|
images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats.
|
|
"""
|
|
import importlib.util
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Union
|
|
|
|
from application.parser.file.base_parser import BaseParser
|
|
|
|
# Module-level logger named after this module; the parser classes below use it
# for progress, warning and error reporting.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DoclingParser(BaseParser):
    """Parser using docling for advanced document processing.

    Docling provides:
    - Advanced PDF layout analysis
    - Table structure recognition
    - Reading order detection
    - OCR for scanned documents (supports RapidOCR)
    - Unified DoclingDocument format
    - Export to Markdown

    Uses hybrid OCR approach by default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)
    """

    # A stripped export shorter than this many characters is treated as
    # "minimal" (e.g. a lone "<!-- image -->" placeholder), which triggers the
    # direct-text fallback in _export_content.
    _MIN_EXPORT_CHARS = 50

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        export_format: str = "markdown",
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ) -> None:
        """Initialize DoclingParser.

        The docling converter itself is created lazily (see ``_init_parser``),
        so constructing the parser does not import docling.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions in documents
            table_structure: Enable table structure recognition
            export_format: Output format ('markdown', 'text', 'html'); any
                other value is exported as plain text
            use_rapidocr: Use RapidOCR engine (default True, works well in Docker)
            ocr_languages: List of OCR languages (default: ['english'])
            force_full_page_ocr: Force OCR on entire page (False = smart hybrid OCR)
        """
        super().__init__()
        self.ocr_enabled = ocr_enabled
        self.table_structure = table_structure
        self.export_format = export_format
        self.use_rapidocr = use_rapidocr
        # Copy so later mutation of the caller's list cannot change this parser.
        self.ocr_languages = list(ocr_languages) if ocr_languages else ["english"]
        self.force_full_page_ocr = force_full_page_ocr
        self._converter = None

    def _create_converter(self):
        """Create a docling converter with hybrid OCR configuration.

        Uses smart OCR approach:
        - When ocr_enabled=True and force_full_page_ocr=False (default):
          Layout model detects text vs bitmap regions, OCR only runs on bitmaps
        - When ocr_enabled=True and force_full_page_ocr=True:
          OCR runs on entire page (for scanned documents/images)
        - When ocr_enabled=False:
          No OCR, only native text extraction

        The same pipeline options are used for both PDF and image inputs.

        Returns:
            DocumentConverter instance
        """
        from docling.document_converter import (
            DocumentConverter,
            ImageFormatOption,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions

        pipeline_options = PdfPipelineOptions(
            do_ocr=self.ocr_enabled,
            do_table_structure=self.table_structure,
        )

        if self.ocr_enabled:
            ocr_options = self._get_ocr_options()
            if ocr_options is not None:
                pipeline_options.ocr_options = ocr_options
            elif self.force_full_page_ocr:
                # No RapidOCR options (disabled or unavailable): docling's
                # default OCR options stay in place. Without this branch the
                # force_full_page_ocr request would be silently dropped.
                self._force_full_page_on_default_ocr(pipeline_options)

        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
                InputFormat.IMAGE: ImageFormatOption(
                    pipeline_options=pipeline_options,
                ),
            }
        )

    @staticmethod
    def _force_full_page_on_default_ocr(pipeline_options) -> None:
        """Best-effort: enable full-page OCR on the pipeline's default OCR options."""
        default_ocr = getattr(pipeline_options, "ocr_options", None)
        if not hasattr(default_ocr, "force_full_page_ocr"):
            logger.warning(
                "force_full_page_ocr requested but the default OCR options "
                "do not expose it; continuing with hybrid OCR"
            )
            return
        try:
            default_ocr.force_full_page_ocr = True
        except (AttributeError, TypeError, ValueError) as e:
            # The options object may reject assignment; keep the previous
            # behaviour (hybrid OCR) rather than failing converter creation.
            logger.warning("Could not enable force_full_page_ocr: %s", e)

    def _init_parser(self) -> Dict:
        """Initialize the docling converter with hybrid OCR.

        Returns:
            The effective parser configuration as a dict.

        Raises:
            ImportError: If docling (or its document_converter module) is not
                importable.
        """
        logger.info("Initializing DoclingParser...")
        logger.info(" ocr_enabled=%s", self.ocr_enabled)
        logger.info(" force_full_page_ocr=%s", self.force_full_page_ocr)
        logger.info(" use_rapidocr=%s", self.use_rapidocr)

        missing_docling_msg = (
            "docling is required for DoclingParser. "
            "Install it with: pip install docling"
        )
        try:
            # find_spec() on a dotted name imports the parent package first and
            # raises ModuleNotFoundError when "docling" itself is absent, so the
            # "is None" check alone would never produce the install hint.
            spec = importlib.util.find_spec("docling.document_converter")
        except ImportError as exc:
            raise ImportError(missing_docling_msg) from exc
        if spec is None:
            raise ImportError(missing_docling_msg)

        # Create converter with hybrid OCR (smart: text direct, bitmaps OCR'd)
        self._converter = self._create_converter()

        logger.info("DoclingParser initialized successfully")
        return {
            "ocr_enabled": self.ocr_enabled,
            "table_structure": self.table_structure,
            "export_format": self.export_format,
            "use_rapidocr": self.use_rapidocr,
            "ocr_languages": self.ocr_languages,
            "force_full_page_ocr": self.force_full_page_ocr,
        }

    def _get_ocr_options(self):
        """Get OCR options based on configuration.

        Returns RapidOcrOptions if use_rapidocr is True and available,
        otherwise returns None to use docling defaults.
        """
        if not self.use_rapidocr:
            return None

        try:
            from docling.datamodel.pipeline_options import RapidOcrOptions

            return RapidOcrOptions(
                lang=self.ocr_languages,
                force_full_page_ocr=self.force_full_page_ocr,
            )
        except ImportError as e:
            logger.warning("Failed to import RapidOcrOptions: %s", e)
            return None
        except Exception as e:
            # Deliberate best-effort: a bad RapidOCR configuration falls back
            # to docling's default OCR options instead of aborting parsing.
            logger.error("Error creating RapidOcrOptions: %s", e)
            return None

    def _export_content(self, document) -> str:
        """Export document content in the configured format.

        Handles edge case where text is nested under picture elements (e.g., OCR'd
        images). If the standard export returns minimal content but document.texts
        contains extracted text, falls back to direct text extraction.

        Args:
            document: The converted docling document.

        Returns:
            The exported content, or the extracted text items joined by blank
            lines when the standard export is minimal.
        """
        if self.export_format == "markdown":
            content = document.export_to_markdown()
        elif self.export_format == "html":
            content = document.export_to_html()
        else:
            # "text" and any unrecognised format fall back to plain text.
            content = document.export_to_text()

        # Handle case where text is nested under pictures (common with OCR'd images)
        # Standard exports may return just "<!-- image -->" while actual text exists;
        # that placeholder is itself shorter than the threshold.
        stripped_content = content.strip()
        if len(stripped_content) >= self._MIN_EXPORT_CHARS:
            return content

        # Extract text directly from document.texts, skipping empty items
        text_items = getattr(document, "texts", None) or []
        extracted_texts = [
            item_text
            for item_text in (getattr(item, "text", None) for item in text_items)
            if item_text
        ]
        if not extracted_texts:
            return content

        logger.info(
            "Standard export minimal (%d chars), extracting %d texts directly",
            len(stripped_content),
            len(extracted_texts),
        )
        return "\n\n".join(extracted_texts)

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file using docling with hybrid OCR.

        Uses smart OCR approach where the layout model detects text vs bitmap
        regions. Text is extracted directly, bitmaps are OCR'd only when needed.

        Args:
            file: Path to the file to parse
            errors: Error handling mode for conversion/export failures. With
                "ignore" (the default) the failure is logged and an error
                marker string is returned; any other value re-raises.

        Returns:
            Parsed document content in the configured export format, or an
            "[Error parsing file with docling: ...]" marker when a failure was
            ignored.

        Raises:
            ImportError: If the converter has to be initialized and docling is
                not importable (raised regardless of ``errors``).
            Exception: Any conversion/export error when ``errors`` is not
                "ignore".
        """
        logger.info("parse_file called for: %s", file)

        # Lazy initialization happens outside the try block on purpose: a
        # missing dependency is not a per-file parsing error.
        if self._converter is None:
            self._init_parser()

        try:
            logger.info("Converting file with hybrid OCR: %s", file)
            result = self._converter.convert(str(file))
            content = self._export_content(result.document)
            logger.info("Parse complete, content length: %d chars", len(content))

            return content

        except Exception as e:
            logger.exception("Error parsing file with docling: %s", e)
            if errors == "ignore":
                return f"[Error parsing file with docling: {e}]"
            raise
|
|
|
|
|
|
class DoclingPDFParser(DoclingParser):
    """PDF preset of DoclingParser with RapidOCR support and Markdown export.

    Hybrid OCR is the default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)

    Pass force_full_page_ocr=True only for fully scanned documents.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ) -> None:
        """Forward the PDF settings to DoclingParser, pinning Markdown export.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions
            table_structure: Enable table structure recognition
            use_rapidocr: Use the RapidOCR engine
            ocr_languages: OCR languages (None lets the base class choose)
            force_full_page_ocr: OCR the entire page instead of hybrid OCR
        """
        base_settings = {
            "ocr_enabled": ocr_enabled,
            "table_structure": table_structure,
            "export_format": "markdown",
            "use_rapidocr": use_rapidocr,
            "ocr_languages": ocr_languages,
            "force_full_page_ocr": force_full_page_ocr,
        }
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingDocxParser(DoclingParser):
    """DOCX preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingPPTXParser(DoclingParser):
    """PPTX preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingXLSXParser(DoclingParser):
    """XLSX preset of DoclingParser with table structure and Markdown export."""

    def __init__(self) -> None:
        """Request table structure recognition and Markdown export explicitly."""
        base_settings = {
            "table_structure": True,
            "export_format": "markdown",
        }
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingHTMLParser(DoclingParser):
    """HTML preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingImageParser(DoclingParser):
    """Image preset of DoclingParser with OCR and RapidOCR support.

    Unlike the base class, force_full_page_ocr defaults to True here: images
    are entirely visual and require full OCR to extract any text.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = True,
    ) -> None:
        """Forward the OCR settings to DoclingParser, pinning Markdown export.

        Args:
            ocr_enabled: Enable OCR
            use_rapidocr: Use the RapidOCR engine
            ocr_languages: OCR languages (None lets the base class choose)
            force_full_page_ocr: OCR the entire image (default True)
        """
        base_settings = {
            "ocr_enabled": ocr_enabled,
            "export_format": "markdown",
            "use_rapidocr": use_rapidocr,
            "ocr_languages": ocr_languages,
            "force_full_page_ocr": force_full_page_ocr,
        }
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingCSVParser(DoclingParser):
    """CSV preset of DoclingParser with table structure and Markdown export."""

    def __init__(self) -> None:
        """Request table structure recognition and Markdown export explicitly."""
        base_settings = {
            "table_structure": True,
            "export_format": "markdown",
        }
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingMarkdownParser(DoclingParser):
    """Markdown preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingAsciiDocParser(DoclingParser):
    """AsciiDoc preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingVTTParser(DoclingParser):
    """WebVTT (video text tracks) preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|
|
|
|
|
|
class DoclingXMLParser(DoclingParser):
    """XML (USPTO, JATS) preset of DoclingParser that exports Markdown."""

    def __init__(self) -> None:
        """Set Markdown export; every other base setting keeps its default."""
        base_settings = {"export_format": "markdown"}
        super().__init__(**base_settings)
|