Files
DocsGPT/tests/parser/file/test_docs_parser.py
2026-03-30 16:13:08 +01:00

188 lines
5.6 KiB
Python

"""Comprehensive tests for application/parser/file/docs_parser.py.

Covers: PDFParser (init, parse with pypdf, parse as image, import error),
DocxParser (init, parse, import error), and the default (empty) metadata
returned by get_file_metadata.
"""
from pathlib import Path
from unittest.mock import MagicMock, patch, mock_open
import pytest
from application.parser.file.docs_parser import PDFParser, DocxParser
# =====================================================================
# PDFParser - Init
# =====================================================================
@pytest.mark.unit
class TestPDFParserInit:
    """Initialisation behaviour of ``PDFParser``."""

    def test_init_parser(self):
        """``_init_parser`` hands back an empty dict."""
        config = PDFParser()._init_parser()

        assert isinstance(config, dict)
        assert config == {}

    def test_parser_config_not_set_initially(self):
        """A freshly constructed parser reports its config as not set."""
        fresh_parser = PDFParser()

        assert not fresh_parser.parser_config_set

    def test_parser_config_set_after_init(self):
        """Calling ``init_parser`` flips ``parser_config_set`` to truthy."""
        pdf_parser = PDFParser()
        pdf_parser.init_parser()

        assert pdf_parser.parser_config_set
# =====================================================================
# PDFParser - Parse File
# =====================================================================
@pytest.mark.unit
class TestPDFParserParse:
    """Exercise ``PDFParser.parse_file`` on its pypdf, image and import-error paths."""

    @patch("application.parser.file.docs_parser.settings")
    def test_parse_with_pypdf(self, mock_settings):
        """Text of every page exposed by the (fake) reader shows up in the result."""
        import sys

        mock_settings.PARSE_PDF_AS_IMAGE = False

        mock_page1 = MagicMock()
        mock_page1.extract_text.return_value = "Page 1 content"
        mock_page2 = MagicMock()
        mock_page2.extract_text.return_value = "Page 2 content"
        mock_reader = MagicMock()
        mock_reader.pages = [mock_page1, mock_page2]

        mock_pypdf = MagicMock()
        mock_pypdf.PdfReader = MagicMock(return_value=mock_reader)

        # The parser imports pypdf inside the function, so the stand-in has to
        # be installed in sys.modules. patch.dict puts back whatever entry was
        # there before (or removes ours), so a real pypdf that was already
        # imported is not thrown away when the test finishes.
        with patch.dict(sys.modules, {"pypdf": mock_pypdf}), \
                patch("builtins.open", mock_open()):
            result = PDFParser().parse_file(Path("test.pdf"))

        assert "Page 1 content" in result
        assert "Page 2 content" in result

    @patch("application.parser.file.docs_parser.settings")
    @patch("application.parser.file.docs_parser.requests")
    def test_parse_as_image(self, mock_requests, mock_settings):
        """With PARSE_PDF_AS_IMAGE on, the ``markdown`` field of the POST response is returned."""
        mock_settings.PARSE_PDF_AS_IMAGE = True

        mock_response = MagicMock()
        mock_response.json.return_value = {"markdown": "# OCR Result"}
        mock_requests.post.return_value = mock_response

        parser = PDFParser()
        with patch("builtins.open", mock_open(read_data=b"fake pdf")):
            result = parser.parse_file(Path("test.pdf"))

        assert result == "# OCR Result"

    @patch("application.parser.file.docs_parser.settings")
    def test_parse_raises_on_missing_pypdf(self, mock_settings):
        """A missing pypdf dependency surfaces as ``ValueError`` from the real ``parse_file``."""
        import sys

        mock_settings.PARSE_PDF_AS_IMAGE = False

        # A None entry in sys.modules makes any import of that name raise
        # ImportError, which simulates pypdf not being installed while still
        # running the parser's own code (instead of stubbing parse_file).
        with patch.dict(sys.modules, {"pypdf": None}), \
                patch("builtins.open", mock_open()):
            with pytest.raises(ValueError, match="pypdf is required"):
                PDFParser().parse_file(Path("test.pdf"))
# =====================================================================
# DocxParser - Init
# =====================================================================
@pytest.mark.unit
class TestDocxParserInit:
    """Initialisation behaviour of ``DocxParser``."""

    def test_init_parser(self):
        """``_init_parser`` hands back an empty dict."""
        config = DocxParser()._init_parser()

        assert isinstance(config, dict)
        assert config == {}

    def test_parser_config_not_set_initially(self):
        """A freshly constructed parser reports its config as not set."""
        fresh_parser = DocxParser()

        assert not fresh_parser.parser_config_set

    def test_parser_config_set_after_init(self):
        """Calling ``init_parser`` flips ``parser_config_set`` to truthy."""
        docx_parser = DocxParser()
        docx_parser.init_parser()

        assert docx_parser.parser_config_set
# =====================================================================
# DocxParser - Parse File
# =====================================================================
@pytest.mark.unit
class TestDocxParserParse:
    """Exercise ``DocxParser.parse_file`` on its success and import-error paths."""

    def test_parse_file_success(self):
        """The text produced by ``docx2txt.process`` is returned unchanged."""
        import sys

        mock_docx2txt = MagicMock()
        mock_docx2txt.process.return_value = "DOCX content here"

        # patch.dict restores the previous sys.modules entry on exit, so a
        # real docx2txt that was already imported is not discarded.
        with patch.dict(sys.modules, {"docx2txt": mock_docx2txt}):
            result = DocxParser().parse_file(Path("test.docx"))

        assert result == "DOCX content here"

    def test_parse_raises_on_missing_docx2txt(self):
        """A missing docx2txt dependency surfaces as ``ValueError`` from the real ``parse_file``."""
        import sys

        # A None entry in sys.modules makes any import of that name raise
        # ImportError, which simulates docx2txt not being installed while
        # still running the parser's own code (instead of stubbing parse_file).
        with patch.dict(sys.modules, {"docx2txt": None}):
            with pytest.raises(ValueError, match="docx2txt is required"):
                DocxParser().parse_file(Path("test.docx"))
# =====================================================================
# BaseParser properties
# =====================================================================
@pytest.mark.unit
class TestBaseParserProperties:
    """Behaviour the parsers expose through their shared base class."""

    def test_get_file_metadata_default(self):
        """``get_file_metadata`` yields an empty dict for a PDF parser."""
        pdf_path = Path("test.pdf")

        metadata = PDFParser().get_file_metadata(pdf_path)

        assert metadata == {}