feat: implement Docling parsers (#2202)

* feat: implement Docling parsers * fix office * docling-ocr-fix * Docling smart ocr * ruff fix --------- Co-authored-by: Pavel <pabin@yandex.ru>
2026-02-03 04:44:10 +00:00 · 2025-12-23 16:33:51 +00:00
parent 5b6cfa6ecc
commit ccd29b7d4e
7 changed files with 439 additions and 24 deletions
--- a/application/Dockerfile
+++ b/application/Dockerfile
@@ -48,7 +48,11 @@ FROM ubuntu:24.04 as final
 RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get update && apt-get install -y --no-install-recommends python3.12 && \
+    apt-get update && apt-get install -y --no-install-recommends \
+        python3.12 \
+        libgl1 \
+        libglib2.0-0 \
+        && \
    ln -s /usr/bin/python3.12 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

--- a/application/api/user/sources/upload.py
+++ b/application/api/user/sources/upload.py
@@ -76,7 +76,12 @@ class UploadFile(Resource):
                    temp_file_path = os.path.join(temp_dir, safe_file)
                    file.save(temp_file_path)

-                    if zipfile.is_zipfile(temp_file_path):
+                    # Only extract actual .zip files, not Office formats (.docx, .xlsx, .pptx)
+                    # which are technically zip archives but should be processed as-is
+                    is_office_format = safe_file.lower().endswith(
+                        (".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub")
+                    )
+                    if zipfile.is_zipfile(temp_file_path) and not is_office_format:
                        try:
                            with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
                                zip_ref.extractall(path=temp_dir)
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -42,6 +42,7 @@ class Settings(BaseSettings):
    UPLOAD_FOLDER: str = "inputs"
    PARSE_PDF_AS_IMAGE: bool = False
    PARSE_IMAGE_REMOTE: bool = False
+    DOCLING_OCR_ENABLED: bool = True  # Enable OCR for docling parsers (PDF, images)
    VECTOR_STORE: str = (
        "faiss"  #  "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
    )
--- a/application/parser/embedding_pipeline.py
+++ b/application/parser/embedding_pipeline.py
@@ -65,6 +65,10 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

+    # Validate docs is not empty
+    if not docs:
+        raise ValueError("No documents to embed - check file format and extension")
+
    # Initialize vector store
    if settings.VECTOR_STORE == "faiss":
        docs_init = [docs.pop(0)]
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -10,29 +10,94 @@ from application.parser.file.epub_parser import EpubParser
 from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
-from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
+from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser
 from application.parser.file.json_parser import JSONParser
 from application.parser.file.pptx_parser import PPTXParser
 from application.parser.file.image_parser import ImageParser
 from application.parser.schema.base import Document
 from application.utils import num_tokens_from_string
+from application.core.settings import settings

-DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
-    ".pdf": PDFParser(),
-    ".docx": DocxParser(),
-    ".csv": PandasCSVParser(),
-    ".xlsx":ExcelParser(),
-    ".epub": EpubParser(),
-    ".md": MarkdownParser(),
-    ".rst": RstParser(),
-    ".html": HTMLParser(),
-    ".mdx": MarkdownParser(),
-    ".json":JSONParser(),
-    ".pptx":PPTXParser(),
-    ".png": ImageParser(),
-    ".jpg": ImageParser(),
-    ".jpeg": ImageParser(),
-}
+
+def get_default_file_extractor() -> Dict[str, BaseParser]:
+    """Build the default file-suffix -> parser mapping.
+
+    Docling-backed parsers are used when the ``docling`` package can be
+    found; otherwise a warning is logged and the standard parsers are
+    returned instead.
+
+    Returns:
+        Dict mapping a lowercase file suffix (leading dot included) to a
+        parser instance.
+    """
+    # Function-scope import keeps the availability probe self-contained.
+    import importlib.util
+
+    try:
+        # docling_parser imports docling lazily (inside its methods), so
+        # importing the wrapper module succeeds even when docling itself is
+        # missing. Probe for the package explicitly; without this check the
+        # fallback below never runs and the failure only surfaces at parse time.
+        if importlib.util.find_spec("docling") is None:
+            raise ImportError("docling is not installed")
+
+        from application.parser.file.docling_parser import (
+            DoclingPDFParser,
+            DoclingDocxParser,
+            DoclingPPTXParser,
+            DoclingXLSXParser,
+            DoclingHTMLParser,
+            DoclingImageParser,
+            DoclingCSVParser,
+            DoclingAsciiDocParser,
+            DoclingVTTParser,
+            DoclingXMLParser,
+        )
+        ocr_enabled = settings.DOCLING_OCR_ENABLED
+        # Extensions that use the same parser configuration share a single
+        # instance, so that instance's converter is only created once.
+        image_parser = DoclingImageParser(ocr_enabled=ocr_enabled)
+        html_parser = DoclingHTMLParser()
+        asciidoc_parser = DoclingAsciiDocParser()
+        return {
+            # Documents
+            ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
+            ".docx": DoclingDocxParser(),
+            ".pptx": DoclingPPTXParser(),
+            ".xlsx": DoclingXLSXParser(),
+            # Web formats
+            ".html": html_parser,
+            ".xhtml": html_parser,
+            # Data formats
+            ".csv": DoclingCSVParser(),
+            ".json": JSONParser(),  # Keep JSON parser (specialized handling)
+            # Text/markup formats
+            ".md": MarkdownParser(),  # Keep markdown parser (specialized handling)
+            ".mdx": MarkdownParser(),
+            ".rst": RstParser(),
+            ".adoc": asciidoc_parser,
+            ".asciidoc": asciidoc_parser,
+            # Images (with OCR)
+            ".png": image_parser,
+            ".jpg": image_parser,
+            ".jpeg": image_parser,
+            ".tiff": image_parser,
+            ".tif": image_parser,
+            ".bmp": image_parser,
+            ".webp": image_parser,
+            # Media/subtitles
+            ".vtt": DoclingVTTParser(),
+            # Specialized XML formats
+            ".xml": DoclingXMLParser(),
+            # Formats docling doesn't support - use standard parsers
+            ".epub": EpubParser(),
+        }
+    except ImportError:
+        logging.warning(
+            "docling is not installed. Using standard parsers. "
+            "For advanced document parsing, install with: pip install docling"
+        )
+        # Fallback to standard parsers
+        return {
+            ".pdf": PDFParser(),
+            ".docx": DocxParser(),
+            ".csv": PandasCSVParser(),
+            ".xlsx": ExcelParser(),
+            ".epub": EpubParser(),
+            ".md": MarkdownParser(),
+            ".rst": RstParser(),
+            ".html": HTMLParser(),
+            ".mdx": MarkdownParser(),
+            ".json": JSONParser(),
+            ".pptx": PPTXParser(),
+            ".png": ImageParser(),
+            ".jpg": ImageParser(),
+            ".jpeg": ImageParser(),
+        }
+
+
+# For backwards compatibility: module-level mapping, built once at import time
+# by calling get_default_file_extractor().
+DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = get_default_file_extractor()


 class SimpleDirectoryReader(BaseReader):
@@ -83,7 +148,10 @@ class SimpleDirectoryReader(BaseReader):

        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
-        self.required_exts = required_exts
+        # Normalize extensions to lowercase for case-insensitive matching
+        self.required_exts = (
+            [ext.lower() for ext in required_exts] if required_exts else None
+        )
        self.num_files_limit = num_files_limit

        if input_files:
@@ -112,7 +180,7 @@ class SimpleDirectoryReader(BaseReader):
                continue
            elif (
                    self.required_exts is not None
-                    and input_file.suffix not in self.required_exts
+                    and input_file.suffix.lower() not in self.required_exts
            ):
                continue
            else:
@@ -149,8 +217,9 @@ class SimpleDirectoryReader(BaseReader):
        self.file_token_counts = {}
        
        for input_file in self.input_files:
-            if input_file.suffix in self.file_extractor:
-                parser = self.file_extractor[input_file.suffix]
+            suffix_lower = input_file.suffix.lower()
+            if suffix_lower in self.file_extractor:
+                parser = self.file_extractor[suffix_lower]
                if not parser.parser_config_set:
                    parser.init_parser()
                data = parser.parse_file(input_file, errors=self.errors)
@@ -232,7 +301,7 @@ class SimpleDirectoryReader(BaseReader):
                    if subtree:
                        result[item.name] = subtree
                else:
-                    if self.required_exts is not None and item.suffix not in self.required_exts:
+                    if self.required_exts is not None and item.suffix.lower() not in self.required_exts:
                        continue
                    
                    full_path = str(item.resolve())
--- a/application/parser/file/docling_parser.py
+++ b/application/parser/file/docling_parser.py
@@ -0,0 +1,330 @@
+"""Docling parser.
+
+Uses docling library for advanced document parsing with layout detection,
+table structure recognition, and unified document representation.
+
+Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc,
+images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats.
+"""
+import importlib.util
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from application.parser.file.base_parser import BaseParser
+
+logger = logging.getLogger(__name__)
+
+
+class DoclingParser(BaseParser):
+    """Parser using docling for advanced document processing.
+
+    Docling provides:
+    - Advanced PDF layout analysis
+    - Table structure recognition
+    - Reading order detection
+    - OCR for scanned documents (supports RapidOCR)
+    - Unified DoclingDocument format
+    - Export to Markdown
+
+    Uses hybrid OCR approach by default:
+    - Text regions: Direct PDF text extraction (fast)
+    - Bitmap/image regions: OCR only these areas (smart)
+
+    The docling converter is created lazily: either by ``_init_parser`` or on
+    the first ``parse_file`` call.
+    """
+
+    # An export shorter than this many characters (after stripping) counts as
+    # "minimal" and triggers the document.texts fallback in _export_content.
+    # The "<!-- image -->" placeholder is shorter than this, so it is covered.
+    _MINIMAL_EXPORT_CHARS = 50
+
+    def __init__(
+        self,
+        ocr_enabled: bool = True,
+        table_structure: bool = True,
+        export_format: str = "markdown",
+        use_rapidocr: bool = True,
+        ocr_languages: Optional[List[str]] = None,
+        force_full_page_ocr: bool = False,
+    ):
+        """Initialize DoclingParser.
+
+        Args:
+            ocr_enabled: Enable OCR for bitmap/image regions in documents
+            table_structure: Enable table structure recognition
+            export_format: Output format ('markdown' or 'html'; any other
+                value selects the plain-text export)
+            use_rapidocr: Use RapidOCR engine (default True, works well in Docker)
+            ocr_languages: List of OCR languages (default: ['english'])
+            force_full_page_ocr: Force OCR on entire page (False = smart hybrid OCR)
+        """
+        super().__init__()
+        self.ocr_enabled = ocr_enabled
+        self.table_structure = table_structure
+        self.export_format = export_format
+        self.use_rapidocr = use_rapidocr
+        self.ocr_languages = ocr_languages or ["english"]
+        self.force_full_page_ocr = force_full_page_ocr
+        # Built on demand by _init_parser().
+        self._converter = None
+
+    def _create_converter(self):
+        """Create a docling converter with hybrid OCR configuration.
+
+        Uses smart OCR approach:
+        - When ocr_enabled=True and force_full_page_ocr=False (default):
+          Layout model detects text vs bitmap regions, OCR only runs on bitmaps
+        - When ocr_enabled=True and force_full_page_ocr=True:
+          OCR runs on entire page (for scanned documents/images)
+        - When ocr_enabled=False:
+          No OCR, only native text extraction
+
+        Returns:
+            DocumentConverter instance
+        """
+        # Imported here so this module can be imported without docling.
+        from docling.document_converter import (
+            DocumentConverter,
+            ImageFormatOption,
+            InputFormat,
+            PdfFormatOption,
+        )
+        from docling.datamodel.pipeline_options import PdfPipelineOptions
+
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=self.ocr_enabled,
+            do_table_structure=self.table_structure,
+        )
+
+        if self.ocr_enabled:
+            # NOTE(review): when _get_ocr_options() returns None the docling
+            # default OCR options are kept, and ocr_languages /
+            # force_full_page_ocr are not applied to them.
+            ocr_options = self._get_ocr_options()
+            if ocr_options is not None:
+                pipeline_options.ocr_options = ocr_options
+
+        # The same pipeline options object is used for PDF and image inputs.
+        return DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options,
+                ),
+                InputFormat.IMAGE: ImageFormatOption(
+                    pipeline_options=pipeline_options,
+                ),
+            }
+        )
+
+    def _init_parser(self) -> Dict:
+        """Create the docling converter and return the parser configuration.
+
+        Returns:
+            Dict echoing the options this parser was constructed with.
+
+        Raises:
+            ImportError: If docling's document_converter module cannot be found.
+        """
+        logger.info("Initializing DoclingParser...")
+        logger.info(f"  ocr_enabled={self.ocr_enabled}")
+        logger.info(f"  force_full_page_ocr={self.force_full_page_ocr}")
+        logger.info(f"  use_rapidocr={self.use_rapidocr}")
+
+        try:
+            converter_spec = importlib.util.find_spec("docling.document_converter")
+        except ModuleNotFoundError:
+            # For a dotted name find_spec imports the parent package first, so
+            # a missing docling raises here instead of returning None.
+            converter_spec = None
+        if converter_spec is None:
+            raise ImportError(
+                "docling is required for DoclingParser. "
+                "Install it with: pip install docling"
+            )
+
+        # Create converter with hybrid OCR (smart: text direct, bitmaps OCR'd)
+        self._converter = self._create_converter()
+
+        logger.info("DoclingParser initialized successfully")
+        return {
+            "ocr_enabled": self.ocr_enabled,
+            "table_structure": self.table_structure,
+            "export_format": self.export_format,
+            "use_rapidocr": self.use_rapidocr,
+            "ocr_languages": self.ocr_languages,
+            "force_full_page_ocr": self.force_full_page_ocr,
+        }
+
+    def _get_ocr_options(self):
+        """Get OCR options based on configuration.
+
+        Returns:
+            A RapidOcrOptions instance when use_rapidocr is True and the
+            options object can be created; otherwise None, which leaves the
+            docling default OCR options in place.
+        """
+        if not self.use_rapidocr:
+            return None
+
+        try:
+            from docling.datamodel.pipeline_options import RapidOcrOptions
+
+            return RapidOcrOptions(
+                lang=self.ocr_languages,
+                force_full_page_ocr=self.force_full_page_ocr,
+            )
+        except ImportError as e:
+            logger.warning(f"Failed to import RapidOcrOptions: {e}")
+            return None
+        except Exception as e:
+            # Deliberate best-effort: any failure falls back to docling defaults.
+            logger.error(f"Error creating RapidOcrOptions: {e}")
+            return None
+
+    def _export_content(self, document) -> str:
+        """Export document content in the configured format.
+
+        Handles edge case where text is nested under picture elements (e.g., OCR'd
+        images). If the standard export returns minimal content but document.texts
+        contains extracted text, falls back to direct text extraction.
+
+        Args:
+            document: The converted docling document (``result.document``).
+
+        Returns:
+            The exported content, or the non-empty ``document.texts`` entries
+            joined by blank lines when the export is minimal.
+        """
+        if self.export_format == "markdown":
+            content = document.export_to_markdown()
+        elif self.export_format == "html":
+            content = document.export_to_html()
+        else:
+            content = document.export_to_text()
+
+        # Handle case where text is nested under pictures (common with OCR'd images)
+        # Standard exports may return just "<!-- image -->" while actual text exists
+        stripped_content = content.strip()
+        is_minimal = len(stripped_content) < self._MINIMAL_EXPORT_CHARS
+
+        if is_minimal and hasattr(document, "texts") and document.texts:
+            # Extract text directly from document.texts
+            extracted_texts = [t.text for t in document.texts if t.text]
+            if extracted_texts:
+                logger.info(
+                    f"Standard export minimal ({len(stripped_content)} chars), "
+                    f"extracting {len(extracted_texts)} texts directly"
+                )
+                return "\n\n".join(extracted_texts)
+
+        return content
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse file using docling with hybrid OCR.
+
+        Uses smart OCR approach where the layout model detects text vs bitmap
+        regions. Text is extracted directly, bitmaps are OCR'd only when needed.
+
+        Args:
+            file: Path to the file to parse
+            errors: Error handling mode. With "ignore" a conversion failure is
+                logged and a bracketed error message is returned as the
+                content; with any other value the exception is re-raised.
+
+        Returns:
+            Parsed document content in the configured export format.
+
+        Raises:
+            ImportError: If docling is unavailable when the converter is
+                first created (raised regardless of ``errors``).
+        """
+        logger.info(f"parse_file called for: {file}")
+
+        # Lazily build the converter if init_parser() was never called.
+        if self._converter is None:
+            self._init_parser()
+
+        try:
+            logger.info(f"Converting file with hybrid OCR: {file}")
+            result = self._converter.convert(str(file))
+            content = self._export_content(result.document)
+            logger.info(f"Parse complete, content length: {len(content)} chars")
+
+            return content
+
+        except Exception as e:
+            logger.error(f"Error parsing file with docling: {e}", exc_info=True)
+            if errors == "ignore":
+                return f"[Error parsing file with docling: {str(e)}]"
+            raise
+
+
+class DoclingPDFParser(DoclingParser):
+    """PDF preset of DoclingParser that always exports Markdown.
+
+    The OCR and table-structure options are forwarded to the base class
+    unchanged. With the defaults, OCR is enabled but not forced onto the whole
+    page (hybrid mode); pass force_full_page_ocr=True only for fully scanned
+    documents.
+    """
+
+    def __init__(
+        self,
+        ocr_enabled: bool = True,
+        table_structure: bool = True,
+        use_rapidocr: bool = True,
+        ocr_languages: Optional[List[str]] = None,
+        force_full_page_ocr: bool = False,
+    ):
+        # Collect the caller-tunable options, then pin the export format.
+        forwarded = {
+            "ocr_enabled": ocr_enabled,
+            "table_structure": table_structure,
+            "use_rapidocr": use_rapidocr,
+            "ocr_languages": ocr_languages,
+            "force_full_page_ocr": force_full_page_ocr,
+        }
+        super().__init__(export_format="markdown", **forwarded)
+
+
+class DoclingDocxParser(DoclingParser):
+    """DOCX preset: Markdown export, all other options at base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingPPTXParser(DoclingParser):
+    """PPTX preset: Markdown export, all other options at base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingXLSXParser(DoclingParser):
+    """XLSX preset: Markdown export with table structure explicitly enabled."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+            table_structure=True,
+        )
+
+
+class DoclingHTMLParser(DoclingParser):
+    """HTML preset: Markdown export, all other options at base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingImageParser(DoclingParser):
+    """Image preset of DoclingParser that always exports Markdown.
+
+    Unlike the base class, force_full_page_ocr defaults to True here: an image
+    is entirely visual, so full-page OCR is needed to extract any text.
+    table_structure is not exposed and keeps the base-class default.
+    """
+
+    def __init__(
+        self,
+        ocr_enabled: bool = True,
+        use_rapidocr: bool = True,
+        ocr_languages: Optional[List[str]] = None,
+        force_full_page_ocr: bool = True,
+    ):
+        # Collect the caller-tunable OCR options, then pin the export format.
+        forwarded = {
+            "ocr_enabled": ocr_enabled,
+            "use_rapidocr": use_rapidocr,
+            "ocr_languages": ocr_languages,
+            "force_full_page_ocr": force_full_page_ocr,
+        }
+        super().__init__(export_format="markdown", **forwarded)
+
+
+class DoclingCSVParser(DoclingParser):
+    """CSV preset: Markdown export with table structure explicitly enabled."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+            table_structure=True,
+        )
+
+
+class DoclingMarkdownParser(DoclingParser):
+    """Markdown preset: Markdown export, all other options at base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingAsciiDocParser(DoclingParser):
+    """AsciiDoc preset: Markdown export, all other options at base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingVTTParser(DoclingParser):
+    """WebVTT (video text tracks) preset: Markdown export, base defaults."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
+
+
+class DoclingXMLParser(DoclingParser):
+    """XML (USPTO, JATS) preset: Markdown export, base defaults otherwise."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            export_format="markdown",
+        )
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -5,6 +5,8 @@ celery==5.6.0
 cryptography==46.0.3
 dataclasses-json==0.6.7
 docling>=2.16.0
+rapidocr>=1.4.0
+onnxruntime>=1.19.0
 docx2txt==0.8
 duckduckgo-search==8.1.1
 ebooklib==0.20