feat: implement Docling parsers (#2202)

* feat: implement Docling parsers

* fix office

* docling-ocr-fix

* Docling smart ocr

* ruff fix

---------

Co-authored-by: Pavel <pabin@yandex.ru>
This commit is contained in:
Alex
2025-12-23 16:33:51 +00:00
committed by GitHub
parent 5b6cfa6ecc
commit ccd29b7d4e
7 changed files with 439 additions and 24 deletions

View File

@@ -48,7 +48,11 @@ FROM ubuntu:24.04 as final
RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && apt-get install -y --no-install-recommends python3.12 && \
apt-get update && apt-get install -y --no-install-recommends \
python3.12 \
libgl1 \
libglib2.0-0 \
&& \
ln -s /usr/bin/python3.12 /usr/bin/python && \
rm -rf /var/lib/apt/lists/*

View File

@@ -76,7 +76,12 @@ class UploadFile(Resource):
temp_file_path = os.path.join(temp_dir, safe_file)
file.save(temp_file_path)
if zipfile.is_zipfile(temp_file_path):
# Only extract actual .zip files, not Office formats (.docx, .xlsx, .pptx)
# which are technically zip archives but should be processed as-is
is_office_format = safe_file.lower().endswith(
(".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub")
)
if zipfile.is_zipfile(temp_file_path) and not is_office_format:
try:
with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
zip_ref.extractall(path=temp_dir)

View File

@@ -42,6 +42,7 @@ class Settings(BaseSettings):
UPLOAD_FOLDER: str = "inputs"
PARSE_PDF_AS_IMAGE: bool = False
PARSE_IMAGE_REMOTE: bool = False
DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images)
VECTOR_STORE: str = (
"faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
)

View File

@@ -65,6 +65,10 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,
if not os.path.exists(folder_name):
os.makedirs(folder_name)
# Validate docs is not empty
if not docs:
raise ValueError("No documents to embed - check file format and extension")
# Initialize vector store
if settings.VECTOR_STORE == "faiss":
docs_init = [docs.pop(0)]

View File

@@ -10,29 +10,94 @@ from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
from application.utils import num_tokens_from_string
from application.core.settings import settings
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),
".html": HTMLParser(),
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
".png": ImageParser(),
".jpg": ImageParser(),
".jpeg": ImageParser(),
}
def get_default_file_extractor() -> Dict[str, BaseParser]:
    """Build the default mapping from lowercase file extension to parser.

    Uses docling parsers by default for advanced document processing.
    Falls back to standard parsers if docling is not installed.

    Returns:
        Dict keyed by file extension (including the leading dot) whose
        values are ready-to-use parser instances.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import importlib.util

    try:
        # The docling_parser module only imports docling lazily inside its
        # methods, so importing the module succeeds even when docling itself
        # is absent. Probe for the package explicitly; otherwise the fallback
        # below would never be reached and parsing would fail later instead.
        if importlib.util.find_spec("docling") is None:
            raise ImportError("docling is not installed")

        from application.parser.file.docling_parser import (
            DoclingPDFParser,
            DoclingDocxParser,
            DoclingPPTXParser,
            DoclingXLSXParser,
            DoclingHTMLParser,
            DoclingImageParser,
            DoclingCSVParser,
            DoclingAsciiDocParser,
            DoclingVTTParser,
            DoclingXMLParser,
        )

        ocr_enabled = settings.DOCLING_OCR_ENABLED
        return {
            # Documents
            ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
            ".docx": DoclingDocxParser(),
            ".pptx": DoclingPPTXParser(),
            ".xlsx": DoclingXLSXParser(),
            # Web formats
            ".html": DoclingHTMLParser(),
            ".xhtml": DoclingHTMLParser(),
            # Data formats
            ".csv": DoclingCSVParser(),
            ".json": JSONParser(),  # Keep JSON parser (specialized handling)
            # Text/markup formats
            ".md": MarkdownParser(),  # Keep markdown parser (specialized handling)
            ".mdx": MarkdownParser(),
            ".rst": RstParser(),
            ".adoc": DoclingAsciiDocParser(),
            ".asciidoc": DoclingAsciiDocParser(),
            # Images (with OCR)
            ".png": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".jpg": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".tiff": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".tif": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".bmp": DoclingImageParser(ocr_enabled=ocr_enabled),
            ".webp": DoclingImageParser(ocr_enabled=ocr_enabled),
            # Media/subtitles
            ".vtt": DoclingVTTParser(),
            # Specialized XML formats
            ".xml": DoclingXMLParser(),
            # Formats docling doesn't support - use standard parsers
            ".epub": EpubParser(),
        }
    except ImportError:
        logging.warning(
            "docling is not installed. Using standard parsers. "
            "For advanced document parsing, install with: pip install docling"
        )
        # Fallback to standard parsers
        return {
            ".pdf": PDFParser(),
            ".docx": DocxParser(),
            ".csv": PandasCSVParser(),
            ".xlsx": ExcelParser(),
            ".epub": EpubParser(),
            ".md": MarkdownParser(),
            ".rst": RstParser(),
            ".html": HTMLParser(),
            ".mdx": MarkdownParser(),
            ".json": JSONParser(),
            ".pptx": PPTXParser(),
            ".png": ImageParser(),
            ".jpg": ImageParser(),
            ".jpeg": ImageParser(),
        }
# For backwards compatibility
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = get_default_file_extractor()
class SimpleDirectoryReader(BaseReader):
@@ -83,7 +148,10 @@ class SimpleDirectoryReader(BaseReader):
self.recursive = recursive
self.exclude_hidden = exclude_hidden
self.required_exts = required_exts
# Normalize extensions to lowercase for case-insensitive matching
self.required_exts = (
[ext.lower() for ext in required_exts] if required_exts else None
)
self.num_files_limit = num_files_limit
if input_files:
@@ -112,7 +180,7 @@ class SimpleDirectoryReader(BaseReader):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
and input_file.suffix.lower() not in self.required_exts
):
continue
else:
@@ -149,8 +217,9 @@ class SimpleDirectoryReader(BaseReader):
self.file_token_counts = {}
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
suffix_lower = input_file.suffix.lower()
if suffix_lower in self.file_extractor:
parser = self.file_extractor[suffix_lower]
if not parser.parser_config_set:
parser.init_parser()
data = parser.parse_file(input_file, errors=self.errors)
@@ -232,7 +301,7 @@ class SimpleDirectoryReader(BaseReader):
if subtree:
result[item.name] = subtree
else:
if self.required_exts is not None and item.suffix not in self.required_exts:
if self.required_exts is not None and item.suffix.lower() not in self.required_exts:
continue
full_path = str(item.resolve())

View File

@@ -0,0 +1,330 @@
"""Docling parser.
Uses docling library for advanced document parsing with layout detection,
table structure recognition, and unified document representation.
Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc,
images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats.
"""
import importlib.util
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union
from application.parser.file.base_parser import BaseParser
logger = logging.getLogger(__name__)
class DoclingParser(BaseParser):
    """Parser using docling for advanced document processing.

    Docling provides:
    - Advanced PDF layout analysis
    - Table structure recognition
    - Reading order detection
    - OCR for scanned documents (supports RapidOCR)
    - Unified DoclingDocument format
    - Export to Markdown

    Uses hybrid OCR approach by default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        export_format: str = "markdown",
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ):
        """Initialize DoclingParser.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions in documents
            table_structure: Enable table structure recognition
            export_format: Output format ('markdown', 'text', 'html'); any
                value other than 'markdown' or 'html' is exported as text
            use_rapidocr: Use RapidOCR engine (default True, works well in Docker)
            ocr_languages: List of OCR languages (default: ['english'])
            force_full_page_ocr: Force OCR on entire page (False = smart hybrid OCR)
        """
        super().__init__()
        self.ocr_enabled = ocr_enabled
        self.table_structure = table_structure
        self.export_format = export_format
        self.use_rapidocr = use_rapidocr
        self.ocr_languages = ocr_languages or ["english"]
        self.force_full_page_ocr = force_full_page_ocr
        # Created lazily by _init_parser(); None until then.
        self._converter = None

    def _create_converter(self):
        """Create a docling converter with hybrid OCR configuration.

        Uses smart OCR approach:
        - When ocr_enabled=True and force_full_page_ocr=False (default):
          Layout model detects text vs bitmap regions, OCR only runs on bitmaps
        - When ocr_enabled=True and force_full_page_ocr=True:
          OCR runs on entire page (for scanned documents/images)
        - When ocr_enabled=False:
          No OCR, only native text extraction

        Returns:
            DocumentConverter instance
        """
        from docling.document_converter import (
            DocumentConverter,
            ImageFormatOption,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions

        pipeline_options = PdfPipelineOptions(
            do_ocr=self.ocr_enabled,
            do_table_structure=self.table_structure,
        )
        if self.ocr_enabled:
            ocr_options = self._get_ocr_options()
            if ocr_options is not None:
                pipeline_options.ocr_options = ocr_options
            else:
                # RapidOCR is disabled or unavailable, so docling's default
                # OCR options stay in place. Still honor force_full_page_ocr
                # on them; otherwise the flag would be silently dropped
                # (e.g. image parsing would no longer OCR the full page).
                default_ocr_options = getattr(pipeline_options, "ocr_options", None)
                if default_ocr_options is not None and hasattr(
                    default_ocr_options, "force_full_page_ocr"
                ):
                    default_ocr_options.force_full_page_ocr = self.force_full_page_ocr

        # The same pipeline options are applied to both PDF and image inputs.
        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
                InputFormat.IMAGE: ImageFormatOption(
                    pipeline_options=pipeline_options,
                ),
            }
        )

    def _init_parser(self) -> Dict:
        """Initialize the docling converter with hybrid OCR.

        Returns:
            Dict describing the configuration the converter was built with.

        Raises:
            ImportError: If docling is not installed.
        """
        logger.info("Initializing DoclingParser...")
        logger.info(" ocr_enabled=%s", self.ocr_enabled)
        logger.info(" force_full_page_ocr=%s", self.force_full_page_ocr)
        logger.info(" use_rapidocr=%s", self.use_rapidocr)

        # find_spec() on a dotted name imports the parent package first and
        # raises ModuleNotFoundError when that parent is missing, so treat
        # that case the same as "spec not found" to surface the install hint.
        try:
            docling_spec = importlib.util.find_spec("docling.document_converter")
        except ModuleNotFoundError:
            docling_spec = None
        if docling_spec is None:
            raise ImportError(
                "docling is required for DoclingParser. "
                "Install it with: pip install docling"
            )

        # Create converter with hybrid OCR (smart: text direct, bitmaps OCR'd)
        self._converter = self._create_converter()
        logger.info("DoclingParser initialized successfully")
        return {
            "ocr_enabled": self.ocr_enabled,
            "table_structure": self.table_structure,
            "export_format": self.export_format,
            "use_rapidocr": self.use_rapidocr,
            "ocr_languages": self.ocr_languages,
            "force_full_page_ocr": self.force_full_page_ocr,
        }

    def _get_ocr_options(self):
        """Get OCR options based on configuration.

        Returns RapidOcrOptions if use_rapidocr is True and available,
        otherwise returns None to use docling defaults.
        """
        if not self.use_rapidocr:
            return None
        try:
            from docling.datamodel.pipeline_options import RapidOcrOptions

            return RapidOcrOptions(
                lang=self.ocr_languages,
                force_full_page_ocr=self.force_full_page_ocr,
            )
        except ImportError as e:
            logger.warning("Failed to import RapidOcrOptions: %s", e)
            return None
        except Exception as e:
            # Deliberate best-effort: any failure building the options falls
            # back to docling's default OCR configuration.
            logger.error("Error creating RapidOcrOptions: %s", e)
            return None

    def _export_content(self, document) -> str:
        """Export document content in the configured format.

        Handles edge case where text is nested under picture elements (e.g., OCR'd
        images). If the standard export returns minimal content but document.texts
        contains extracted text, falls back to direct text extraction.

        Args:
            document: Converted docling document (the ``document`` attribute
                of a conversion result).

        Returns:
            The exported content, or the non-empty entries of
            ``document.texts`` joined by blank lines when the export is minimal.
        """
        if self.export_format == "markdown":
            content = document.export_to_markdown()
        elif self.export_format == "html":
            content = document.export_to_html()
        else:
            content = document.export_to_text()

        # Handle case where text is nested under pictures (common with OCR'd images)
        # Standard exports may return just "<!-- image -->" while actual text exists
        stripped_content = content.strip()
        is_minimal = len(stripped_content) < 50 or stripped_content == "<!-- image -->"
        if is_minimal and getattr(document, "texts", None):
            # Extract text directly from document.texts
            extracted_texts = [t.text for t in document.texts if t.text]
            if extracted_texts:
                logger.info(
                    "Standard export minimal (%d chars), "
                    "extracting %d texts directly",
                    len(stripped_content),
                    len(extracted_texts),
                )
                return "\n\n".join(extracted_texts)
        return content

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file using docling with hybrid OCR.

        Uses smart OCR approach where the layout model detects text vs bitmap
        regions. Text is extracted directly, bitmaps are OCR'd only when needed.

        Args:
            file: Path to the file to parse
            errors: Error handling mode. With "ignore", a conversion failure
                is logged and a bracketed error message is returned as the
                content; with any other value the exception is re-raised.

        Returns:
            Parsed document content in the configured export format.

        Raises:
            ImportError: If the converter has to be created and docling is
                not installed (raised regardless of ``errors``).
        """
        logger.info("parse_file called for: %s", file)
        if self._converter is None:
            self._init_parser()
        try:
            logger.info("Converting file with hybrid OCR: %s", file)
            result = self._converter.convert(str(file))
            content = self._export_content(result.document)
            logger.info("Parse complete, content length: %d chars", len(content))
            return content
        except Exception as e:
            # logger.exception logs at ERROR level and attaches the traceback.
            logger.exception("Error parsing file with docling: %s", e)
            if errors == "ignore":
                return f"[Error parsing file with docling: {str(e)}]"
            raise
class DoclingPDFParser(DoclingParser):
    """PDF flavour of DoclingParser that always exports Markdown.

    Hybrid OCR is the default behaviour:
    - text regions are read straight from the PDF (fast)
    - bitmap/image regions are the only areas sent through OCR (smart)

    Pass force_full_page_ocr=True only for fully scanned documents.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ):
        """Forward every option to DoclingParser with export_format pinned to markdown."""
        base_options = dict(
            ocr_enabled=ocr_enabled,
            table_structure=table_structure,
            export_format="markdown",
            use_rapidocr=use_rapidocr,
            ocr_languages=ocr_languages,
            force_full_page_ocr=force_full_page_ocr,
        )
        super().__init__(**base_options)
class DoclingDocxParser(DoclingParser):
    """DOCX parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingPPTXParser(DoclingParser):
    """PPTX parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingXLSXParser(DoclingParser):
    """XLSX parser built on DoclingParser, with table structure enabled."""

    def __init__(self):
        """Explicitly enable table structure recognition and export Markdown."""
        base_options = {"table_structure": True, "export_format": "markdown"}
        super().__init__(**base_options)
class DoclingHTMLParser(DoclingParser):
    """HTML parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingImageParser(DoclingParser):
    """Image parser built on DoclingParser, with OCR and RapidOCR support.

    Unlike the base class, force_full_page_ocr defaults to True here: an
    image is entirely visual, so full OCR is needed to extract any text.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = True,
    ):
        """Forward the OCR options to DoclingParser with export_format pinned to markdown."""
        base_options = dict(
            ocr_enabled=ocr_enabled,
            export_format="markdown",
            use_rapidocr=use_rapidocr,
            ocr_languages=ocr_languages,
            force_full_page_ocr=force_full_page_ocr,
        )
        super().__init__(**base_options)
class DoclingCSVParser(DoclingParser):
    """CSV parser built on DoclingParser."""

    def __init__(self):
        """Explicitly enable table structure recognition and export Markdown."""
        base_options = {"table_structure": True, "export_format": "markdown"}
        super().__init__(**base_options)
class DoclingMarkdownParser(DoclingParser):
    """Markdown parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingAsciiDocParser(DoclingParser):
    """AsciiDoc parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingVTTParser(DoclingParser):
    """WebVTT (video text tracks) parser built on DoclingParser."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)
class DoclingXMLParser(DoclingParser):
    """XML parser built on DoclingParser (USPTO, JATS)."""

    def __init__(self):
        """Request Markdown export; every other option keeps the base default."""
        output_format = "markdown"
        super().__init__(export_format=output_format)

View File

@@ -5,6 +5,8 @@ celery==5.6.0
cryptography==46.0.3
dataclasses-json==0.6.7
docling>=2.16.0
rapidocr>=1.4.0
onnxruntime>=1.19.0
docx2txt==0.8
duckduckgo-search==8.1.1
ebooklib==0.20