diff --git a/application/Dockerfile b/application/Dockerfile index e33721a2..31904c5a 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -48,7 +48,11 @@ FROM ubuntu:24.04 as final RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update && apt-get install -y --no-install-recommends python3.12 && \ + apt-get update && apt-get install -y --no-install-recommends \ + python3.12 \ + libgl1 \ + libglib2.0-0 \ + && \ ln -s /usr/bin/python3.12 /usr/bin/python && \ rm -rf /var/lib/apt/lists/* diff --git a/application/api/user/sources/upload.py b/application/api/user/sources/upload.py index 7519f7b5..6c163da4 100644 --- a/application/api/user/sources/upload.py +++ b/application/api/user/sources/upload.py @@ -76,7 +76,12 @@ class UploadFile(Resource): temp_file_path = os.path.join(temp_dir, safe_file) file.save(temp_file_path) - if zipfile.is_zipfile(temp_file_path): + # Only extract actual .zip files, not Office formats (.docx, .xlsx, .pptx) + # which are technically zip archives but should be processed as-is + is_office_format = safe_file.lower().endswith( + (".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub") + ) + if zipfile.is_zipfile(temp_file_path) and not is_office_format: try: with zipfile.ZipFile(temp_file_path, "r") as zip_ref: zip_ref.extractall(path=temp_dir) diff --git a/application/core/settings.py b/application/core/settings.py index 0f49d8fe..d8bb6ff1 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -42,6 +42,7 @@ class Settings(BaseSettings): UPLOAD_FOLDER: str = "inputs" PARSE_PDF_AS_IMAGE: bool = False PARSE_IMAGE_REMOTE: bool = False + DOCLING_OCR_ENABLED: bool = True # Enable OCR for docling parsers (PDF, images) VECTOR_STORE: str = ( "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector" ) diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py 
def get_default_file_extractor() -> Dict[str, BaseParser]:
    """Build the default mapping from file extension to parser instance.

    Docling-based parsers are preferred when the ``docling`` package is
    available; otherwise the standard (non-docling) parsers are used.

    Returns:
        Dict keyed by lowercase file suffix (including the leading dot)
        whose values are ready-to-use parser instances.
    """
    import importlib.util

    try:
        # docling_parser only imports docling lazily (inside the converter
        # factory), so importing that module succeeds even when docling is
        # missing. Probe for the package explicitly, otherwise the fallback
        # below would never be reached and parsing would fail later.
        if importlib.util.find_spec("docling") is None:
            raise ImportError("docling is not installed")

        from application.parser.file.docling_parser import (
            DoclingPDFParser,
            DoclingDocxParser,
            DoclingPPTXParser,
            DoclingXLSXParser,
            DoclingHTMLParser,
            DoclingImageParser,
            DoclingCSVParser,
            DoclingAsciiDocParser,
            DoclingVTTParser,
            DoclingXMLParser,
        )

        ocr_enabled = settings.DOCLING_OCR_ENABLED

        # Each Docling parser lazily builds its own converter, so extensions
        # that are aliases of one format share a single parser instance
        # instead of constructing one converter per extension.
        html_parser = DoclingHTMLParser()
        asciidoc_parser = DoclingAsciiDocParser()
        image_parser = DoclingImageParser(ocr_enabled=ocr_enabled)
        markdown_parser = MarkdownParser()

        return {
            # Documents
            ".pdf": DoclingPDFParser(ocr_enabled=ocr_enabled),
            ".docx": DoclingDocxParser(),
            ".pptx": DoclingPPTXParser(),
            ".xlsx": DoclingXLSXParser(),
            # Web formats
            ".html": html_parser,
            ".xhtml": html_parser,
            # Data formats
            ".csv": DoclingCSVParser(),
            ".json": JSONParser(),  # Keep JSON parser (specialized handling)
            # Text/markup formats
            ".md": markdown_parser,  # Keep markdown parser (specialized handling)
            ".mdx": markdown_parser,
            ".rst": RstParser(),
            ".adoc": asciidoc_parser,
            ".asciidoc": asciidoc_parser,
            # Images (with OCR)
            ".png": image_parser,
            ".jpg": image_parser,
            ".jpeg": image_parser,
            ".tiff": image_parser,
            ".tif": image_parser,
            ".bmp": image_parser,
            ".webp": image_parser,
            # Media/subtitles
            ".vtt": DoclingVTTParser(),
            # Specialized XML formats
            ".xml": DoclingXMLParser(),
            # Formats docling doesn't support - use standard parsers
            ".epub": EpubParser(),
        }
    except ImportError:
        logging.warning(
            "docling is not installed. Using standard parsers. "
            "For advanced document parsing, install with: pip install docling"
        )
        # Fallback to standard parsers
        return {
            ".pdf": PDFParser(),
            ".docx": DocxParser(),
            ".csv": PandasCSVParser(),
            ".xlsx": ExcelParser(),
            ".epub": EpubParser(),
            ".md": MarkdownParser(),
            ".rst": RstParser(),
            ".html": HTMLParser(),
            ".mdx": MarkdownParser(),
            ".json": JSONParser(),
            ".pptx": PPTXParser(),
            ".png": ImageParser(),
            ".jpg": ImageParser(),
            ".jpeg": ImageParser(),
        }


# For backwards compatibility: computed once at import time, so later changes
# to settings are not reflected in this constant.
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = get_default_file_extractor()
"""Docling parser.

Uses docling library for advanced document parsing with layout detection,
table structure recognition, and unified document representation.

Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc,
images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats.
"""
import importlib.util
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union

from application.parser.file.base_parser import BaseParser

logger = logging.getLogger(__name__)

# Exports shorter than this (after stripping) are treated as "nothing useful
# was exported" and trigger the direct-text fallback in _export_content.
_MIN_EXPORT_CHARS = 50


class DoclingParser(BaseParser):
    """Parser using docling for advanced document processing.

    Docling provides:
    - Advanced PDF layout analysis
    - Table structure recognition
    - Reading order detection
    - OCR for scanned documents (supports RapidOCR)
    - Unified DoclingDocument format
    - Export to Markdown

    Uses hybrid OCR approach by default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        export_format: str = "markdown",
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ):
        """Initialize DoclingParser.

        Args:
            ocr_enabled: Enable OCR for bitmap/image regions in documents
            table_structure: Enable table structure recognition
            export_format: Output format ('markdown', 'html'); any other
                value is exported as plain text
            use_rapidocr: Use RapidOCR engine (default True, works well in Docker)
            ocr_languages: List of OCR languages (default: ['english'])
            force_full_page_ocr: Force OCR on entire page (False = smart hybrid OCR)
        """
        super().__init__()
        self.ocr_enabled = ocr_enabled
        self.table_structure = table_structure
        self.export_format = export_format
        self.use_rapidocr = use_rapidocr
        self.ocr_languages = ocr_languages or ["english"]
        self.force_full_page_ocr = force_full_page_ocr
        # Built lazily by _init_parser so constructing a parser stays cheap.
        self._converter = None

    def _create_converter(self):
        """Create a docling converter with hybrid OCR configuration.

        Uses smart OCR approach:
        - When ocr_enabled=True and force_full_page_ocr=False (default):
          Layout model detects text vs bitmap regions, OCR only runs on bitmaps
        - When ocr_enabled=True and force_full_page_ocr=True:
          OCR runs on entire page (for scanned documents/images)
        - When ocr_enabled=False:
          No OCR, only native text extraction

        Returns:
            DocumentConverter instance
        """
        from docling.document_converter import (
            DocumentConverter,
            ImageFormatOption,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions

        pipeline_options = PdfPipelineOptions(
            do_ocr=self.ocr_enabled,
            do_table_structure=self.table_structure,
        )

        if self.ocr_enabled:
            ocr_options = self._get_ocr_options()
            if ocr_options is not None:
                pipeline_options.ocr_options = ocr_options
            elif self.force_full_page_ocr:
                # RapidOCR is disabled or unavailable, so docling's default
                # OCR options stay in place; still honor the full-page
                # request instead of silently dropping it.
                pipeline_options.ocr_options.force_full_page_ocr = True

        # The same pipeline options are applied to PDFs and images.
        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
                InputFormat.IMAGE: ImageFormatOption(
                    pipeline_options=pipeline_options,
                ),
            }
        )

    def _init_parser(self) -> Dict:
        """Initialize the docling converter with hybrid OCR.

        Returns:
            Dict describing the configuration this parser was created with.

        Raises:
            ImportError: If docling is not installed.
        """
        logger.info("Initializing DoclingParser...")
        logger.info(" ocr_enabled=%s", self.ocr_enabled)
        logger.info(" force_full_page_ocr=%s", self.force_full_page_ocr)
        logger.info(" use_rapidocr=%s", self.use_rapidocr)

        try:
            spec = importlib.util.find_spec("docling.document_converter")
        except ImportError:
            # For a dotted name find_spec imports the parent package first
            # and raises (instead of returning None) when `docling` itself
            # is missing; normalize that to the same install hint.
            spec = None
        if spec is None:
            raise ImportError(
                "docling is required for DoclingParser. "
                "Install it with: pip install docling"
            )

        # Create converter with hybrid OCR (smart: text direct, bitmaps OCR'd)
        self._converter = self._create_converter()

        logger.info("DoclingParser initialized successfully")
        return {
            "ocr_enabled": self.ocr_enabled,
            "table_structure": self.table_structure,
            "export_format": self.export_format,
            "use_rapidocr": self.use_rapidocr,
            "ocr_languages": self.ocr_languages,
            "force_full_page_ocr": self.force_full_page_ocr,
        }

    def _get_ocr_options(self):
        """Get OCR options based on configuration.

        Returns RapidOcrOptions if use_rapidocr is True and available,
        otherwise returns None to use docling defaults.
        """
        if not self.use_rapidocr:
            return None

        try:
            from docling.datamodel.pipeline_options import RapidOcrOptions

            return RapidOcrOptions(
                lang=self.ocr_languages,
                force_full_page_ocr=self.force_full_page_ocr,
            )
        except ImportError as e:
            logger.warning("Failed to import RapidOcrOptions: %s", e)
            return None
        except Exception as e:
            # Best effort: an invalid RapidOCR configuration must not prevent
            # parsing with docling's default OCR options.
            logger.error("Error creating RapidOcrOptions: %s", e)
            return None

    def _export_content(self, document) -> str:
        """Export document content in the configured format.

        Handles edge case where text is nested under picture elements (e.g.,
        OCR'd images). If the standard export returns minimal content but
        document.texts contains extracted text, falls back to direct text
        extraction.

        Args:
            document: Converted document exposing the ``export_to_*`` methods
                and, optionally, a ``texts`` collection of items with ``text``.

        Returns:
            The exported content, or the directly extracted texts joined by
            blank lines when the export was (nearly) empty.
        """
        if self.export_format == "markdown":
            content = document.export_to_markdown()
        elif self.export_format == "html":
            content = document.export_to_html()
        else:
            content = document.export_to_text()

        # Handle case where text is nested under pictures (common with OCR'd
        # images): the export may be empty or a bare placeholder while the
        # recognized text still exists in document.texts. An empty export is
        # already covered by the length check.
        stripped_content = content.strip()
        if len(stripped_content) >= _MIN_EXPORT_CHARS:
            return content

        texts = getattr(document, "texts", None)
        if texts:
            # Extract text directly from document.texts
            extracted_texts = [t.text for t in texts if t.text]
            if extracted_texts:
                logger.info(
                    "Standard export minimal (%d chars), "
                    "extracting %d texts directly",
                    len(stripped_content),
                    len(extracted_texts),
                )
                return "\n\n".join(extracted_texts)

        return content

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file using docling with hybrid OCR.

        Uses smart OCR approach where the layout model detects text vs bitmap
        regions. Text is extracted directly, bitmaps are OCR'd only when needed.

        Args:
            file: Path to the file to parse
            errors: Error handling mode. With "ignore" a conversion failure is
                logged and returned as a bracketed error message; any other
                value re-raises the exception.

        Returns:
            Parsed document content in the configured export format, or an
            error message string when conversion fails and errors == "ignore".

        Raises:
            ImportError: If docling is not installed (raised during lazy
                initialization, regardless of ``errors``).
            Exception: Any conversion error when ``errors`` is not "ignore".
        """
        logger.info("parse_file called for: %s", file)

        # Lazy initialization; deliberately outside the try block so that a
        # missing dependency is not reported as a per-file parse error.
        if self._converter is None:
            self._init_parser()

        try:
            logger.info("Converting file with hybrid OCR: %s", file)
            result = self._converter.convert(str(file))
            content = self._export_content(result.document)
            logger.info("Parse complete, content length: %d chars", len(content))

            return content

        except Exception as e:
            logger.error("Error parsing file with docling: %s", e, exc_info=True)
            if errors == "ignore":
                return f"[Error parsing file with docling: {e}]"
            raise


class DoclingPDFParser(DoclingParser):
    """Docling-based PDF parser with advanced features and RapidOCR support.

    Uses hybrid OCR approach by default:
    - Text regions: Direct PDF text extraction (fast)
    - Bitmap/image regions: OCR only these areas (smart)

    Set force_full_page_ocr=True only for fully scanned documents.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        table_structure: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = False,
    ):
        super().__init__(
            ocr_enabled=ocr_enabled,
            table_structure=table_structure,
            export_format="markdown",
            use_rapidocr=use_rapidocr,
            ocr_languages=ocr_languages,
            force_full_page_ocr=force_full_page_ocr,
        )


class DoclingDocxParser(DoclingParser):
    """Docling-based DOCX parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingPPTXParser(DoclingParser):
    """Docling-based PPTX parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingXLSXParser(DoclingParser):
    """Docling-based XLSX parser with table structure."""

    def __init__(self):
        super().__init__(table_structure=True, export_format="markdown")


class DoclingHTMLParser(DoclingParser):
    """Docling-based HTML parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingImageParser(DoclingParser):
    """Docling-based image parser with OCR and RapidOCR support.

    For images, force_full_page_ocr=True is used since images are entirely
    visual and require full OCR to extract any text.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = True,
    ):
        super().__init__(
            ocr_enabled=ocr_enabled,
            export_format="markdown",
            use_rapidocr=use_rapidocr,
            ocr_languages=ocr_languages,
            force_full_page_ocr=force_full_page_ocr,
        )


class DoclingCSVParser(DoclingParser):
    """Docling-based CSV parser."""

    def __init__(self):
        super().__init__(table_structure=True, export_format="markdown")


class DoclingMarkdownParser(DoclingParser):
    """Docling-based Markdown parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingAsciiDocParser(DoclingParser):
    """Docling-based AsciiDoc parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingVTTParser(DoclingParser):
    """Docling-based WebVTT (video text tracks) parser."""

    def __init__(self):
        super().__init__(export_format="markdown")


class DoclingXMLParser(DoclingParser):
    """Docling-based XML parser (USPTO, JATS)."""

    def __init__(self):
        super().__init__(export_format="markdown")