"""Comprehensive tests for application/parser/file/docs_parser.py Covers: PDFParser (init, parse with pypdf, parse as image, import error), DocxParser (init, parse, import error). """ from pathlib import Path from unittest.mock import MagicMock, patch, mock_open import pytest from application.parser.file.docs_parser import PDFParser, DocxParser # ===================================================================== # PDFParser - Init # ===================================================================== @pytest.mark.unit class TestPDFParserInit: def test_init_parser(self): parser = PDFParser() result = parser._init_parser() assert isinstance(result, dict) assert result == {} def test_parser_config_not_set_initially(self): parser = PDFParser() assert not parser.parser_config_set def test_parser_config_set_after_init(self): parser = PDFParser() parser.init_parser() assert parser.parser_config_set # ===================================================================== # PDFParser - Parse File # ===================================================================== @pytest.mark.unit class TestPDFParserParse: @patch("application.parser.file.docs_parser.settings") def test_parse_with_pypdf(self, mock_settings): mock_settings.PARSE_PDF_AS_IMAGE = False parser = PDFParser() mock_page1 = MagicMock() mock_page1.extract_text.return_value = "Page 1 content" mock_page2 = MagicMock() mock_page2.extract_text.return_value = "Page 2 content" mock_reader = MagicMock() mock_reader.pages = [mock_page1, mock_page2] with patch("application.parser.file.docs_parser.PdfReader", create=True), \ patch("builtins.open", mock_open()): # Need to patch the import inside the function import sys mock_pypdf = MagicMock() mock_pypdf.PdfReader = MagicMock(return_value=mock_reader) sys.modules["pypdf"] = mock_pypdf try: result = parser.parse_file(Path("test.pdf")) assert "Page 1 content" in result assert "Page 2 content" in result finally: del sys.modules["pypdf"] @patch("application.parser.file.docs_parser.settings") @patch("application.parser.file.docs_parser.requests") def test_parse_as_image(self, mock_requests, mock_settings): mock_settings.PARSE_PDF_AS_IMAGE = True mock_response = MagicMock() mock_response.json.return_value = {"markdown": "# OCR Result"} mock_requests.post.return_value = mock_response parser = PDFParser() with patch("builtins.open", mock_open(read_data=b"fake pdf")): result = parser.parse_file(Path("test.pdf")) assert result == "# OCR Result" @patch("application.parser.file.docs_parser.settings") def test_parse_raises_on_missing_pypdf(self, mock_settings): mock_settings.PARSE_PDF_AS_IMAGE = False parser = PDFParser() # Simulate the import error path original = parser.parse_file def mock_parse(*args, **kwargs): raise ValueError("pypdf is required to read PDF files.") parser.parse_file = mock_parse try: with pytest.raises(ValueError, match="pypdf is required"): parser.parse_file(Path("test.pdf")) finally: parser.parse_file = original # ===================================================================== # DocxParser - Init # ===================================================================== @pytest.mark.unit class TestDocxParserInit: def test_init_parser(self): parser = DocxParser() result = parser._init_parser() assert isinstance(result, dict) assert result == {} def test_parser_config_not_set_initially(self): parser = DocxParser() assert not parser.parser_config_set def test_parser_config_set_after_init(self): parser = DocxParser() parser.init_parser() assert parser.parser_config_set # ===================================================================== # DocxParser - Parse File # ===================================================================== @pytest.mark.unit class TestDocxParserParse: def test_parse_file_success(self): parser = DocxParser() import sys mock_docx2txt = MagicMock() mock_docx2txt.process.return_value = "DOCX content here" sys.modules["docx2txt"] = mock_docx2txt try: result = parser.parse_file(Path("test.docx")) assert result == "DOCX content here" finally: del sys.modules["docx2txt"] def test_parse_raises_on_missing_docx2txt(self): parser = DocxParser() original = parser.parse_file def mock_parse(*args, **kwargs): raise ValueError("docx2txt is required to read Microsoft Word files.") parser.parse_file = mock_parse try: with pytest.raises(ValueError, match="docx2txt is required"): parser.parse_file(Path("test.docx")) finally: parser.parse_file = original # ===================================================================== # BaseParser properties # ===================================================================== @pytest.mark.unit class TestBaseParserProperties: def test_get_file_metadata_default(self): parser = PDFParser() meta = parser.get_file_metadata(Path("test.pdf")) assert meta == {} # ===================================================================== # Coverage gap tests (lines 33-34, 59, 63) # ===================================================================== @pytest.mark.unit class TestDocsParserGaps: def test_pdf_parser_parse_as_image(self, tmp_path): """Cover lines 33-34: PARSE_PDF_AS_IMAGE sends to external service.""" from application.parser.file.docs_parser import PDFParser pdf_file = tmp_path / "test.pdf" pdf_file.write_bytes(b"%PDF-1.4 fake content") with patch( "application.parser.file.docs_parser.settings" ) as mock_settings: mock_settings.PARSE_PDF_AS_IMAGE = True with patch( "application.parser.file.docs_parser.requests.post" ) as mock_post: mock_post.return_value = MagicMock( json=MagicMock(return_value={"markdown": "# Parsed Content"}) ) parser = PDFParser() result = parser.parse_file(pdf_file) assert result == "# Parsed Content" mock_post.assert_called_once() def test_docx_parser_init_parser(self): """Cover line 59: DocxParser._init_parser returns empty dict.""" from application.parser.file.docs_parser import DocxParser parser = DocxParser() config = parser._init_parser() assert config == {} def test_docx_parser_import_error(self): """Cover line 63: ImportError when docx2txt not installed.""" from application.parser.file.docs_parser import DocxParser parser = DocxParser() with patch.dict("sys.modules", {"docx2txt": None}): with patch( "builtins.__import__", side_effect=ImportError("No module named 'docx2txt'"), ): with pytest.raises((ImportError, ValueError)): parser.parse_file(Path("/tmp/fake.docx"))