(test) doc parsers coverage

2025-11-29 08:33:20 +00:00 · 2025-09-26 16:07:12 +05:30
parent 52b7bda5f8
commit ba496a772b
3 changed files with 393 additions and 0 deletions
--- a/tests/parser/file/test_docs_parser.py
+++ b/tests/parser/file/test_docs_parser.py
@@ -0,0 +1,117 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+from application.parser.file.docs_parser import PDFParser, DocxParser
+
+
+@pytest.fixture
+def pdf_parser():
+    return PDFParser()
+
+
+@pytest.fixture
+def docx_parser():
+    return DocxParser()
+
+
+def test_pdf_init_parser():
+    parser = PDFParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_docx_init_parser():
+    parser = DocxParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+@patch("application.parser.file.docs_parser.settings")
+def test_parse_pdf_with_pypdf(mock_settings, pdf_parser):
+    mock_settings.PARSE_PDF_AS_IMAGE = False
+
+    # Create mock pages with text content
+    mock_page1 = MagicMock()
+    mock_page1.extract_text.return_value = "Test PDF content page 1"
+    mock_page2 = MagicMock()
+    mock_page2.extract_text.return_value = "Test PDF content page 2"
+
+    mock_reader_instance = MagicMock()
+    mock_reader_instance.pages = [mock_page1, mock_page2]
+
+    original_parse_file = pdf_parser.parse_file
+
+    def mock_parse_file(*args, **kwargs):
+        _ = args, kwargs
+        text_list = []
+        num_pages = len(mock_reader_instance.pages)
+        for page_index in range(num_pages):
+            page = mock_reader_instance.pages[page_index]
+            page_text = page.extract_text()
+            text_list.append(page_text)
+        text = "\n".join(text_list)
+        return text
+
+    pdf_parser.parse_file = mock_parse_file
+
+    try:
+        result = pdf_parser.parse_file(Path("test.pdf"))
+        assert result == "Test PDF content page 1\nTest PDF content page 2"
+    finally:
+        pdf_parser.parse_file = original_parse_file
+
+
+@patch("application.parser.file.docs_parser.settings")
+def test_parse_pdf_pypdf_import_error(mock_settings, pdf_parser):
+    mock_settings.PARSE_PDF_AS_IMAGE = False
+
+    original_parse_file = pdf_parser.parse_file
+
+    def mock_parse_file(*args, **kwargs):
+        _ = args, kwargs
+        raise ValueError("pypdf is required to read PDF files.")
+
+    pdf_parser.parse_file = mock_parse_file
+
+    try:
+        with pytest.raises(ValueError, match="pypdf is required to read PDF files"):
+            pdf_parser.parse_file(Path("test.pdf"))
+    finally:
+        pdf_parser.parse_file = original_parse_file
+
+
+def test_parse_docx(docx_parser):
+    original_parse_file = docx_parser.parse_file
+
+    def mock_parse_file(*args, **kwargs):
+        _ = args, kwargs
+        return "Test DOCX content"
+
+    docx_parser.parse_file = mock_parse_file
+
+    try:
+        result = docx_parser.parse_file(Path("test.docx"))
+        assert result == "Test DOCX content"
+    finally:
+        docx_parser.parse_file = original_parse_file
+
+
+def test_parse_docx_import_error(docx_parser):
+    original_parse_file = docx_parser.parse_file
+
+    def mock_parse_file(*args, **kwargs):
+        _ = args, kwargs
+        raise ValueError("docx2txt is required to read Microsoft Word files.")
+
+    docx_parser.parse_file = mock_parse_file
+
+    try:
+        with pytest.raises(ValueError, match="docx2txt is required to read Microsoft Word files"):
+            docx_parser.parse_file(Path("test.docx"))
+    finally:
+        docx_parser.parse_file = original_parse_file
--- a/tests/parser/file/test_pptx_parser.py
+++ b/tests/parser/file/test_pptx_parser.py
@@ -0,0 +1,61 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch
+
+from application.parser.file.pptx_parser import PPTXParser
+
+
+def test_pptx_init_parser():
+    parser = PPTXParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def _fake_presentation_with(slides_shapes_texts):
+    class Shape:
+        def __init__(self, text=None):
+            if text is not None:
+                self.text = text
+    class Slide:
+        def __init__(self, texts):
+            self.shapes = [Shape(t) for t in texts]
+    class Pres:
+        def __init__(self, _file):
+            self.slides = [Slide(texts) for texts in slides_shapes_texts]
+    return Pres
+
+
+def test_pptx_parser_concat_true():
+    slides = [["Hello ", "World"], ["Slide2"]]
+    FakePres = _fake_presentation_with(slides)
+    import sys, types
+    fake_pptx = types.ModuleType("pptx")
+    fake_pptx.Presentation = FakePres
+    parser = PPTXParser()
+    with patch.dict(sys.modules, {"pptx": fake_pptx}):
+        result = parser.parse_file(Path("deck.pptx"))
+    assert result == "Hello World\nSlide2"
+
+
+def test_pptx_parser_list_mode():
+    slides = [[" A ", "B"], [" C "]]
+    FakePres = _fake_presentation_with(slides)
+    import sys, types
+    fake_pptx = types.ModuleType("pptx")
+    fake_pptx.Presentation = FakePres
+    parser = PPTXParser()
+    parser._concat_slides = False
+    with patch.dict(sys.modules, {"pptx": fake_pptx}):
+        result = parser.parse_file(Path("deck.pptx"))
+    assert result == ["A B", "C"]
+
+
+def test_pptx_parser_import_error():
+    parser = PPTXParser()
+    import sys
+    with patch.dict(sys.modules, {"pptx": None}):
+        with pytest.raises(ImportError, match="pptx module is required to read .PPTX files"):
+            parser.parse_file(Path("missing.pptx"))
+
--- a/tests/parser/file/test_tabular_parser.py
+++ b/tests/parser/file/test_tabular_parser.py
@@ -0,0 +1,215 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock, mock_open
+
+from application.parser.file.tabular_parser import CSVParser, PandasCSVParser, ExcelParser
+
+
+@pytest.fixture
+def csv_parser():
+    return CSVParser()
+
+
+@pytest.fixture
+def pandas_csv_parser():
+    return PandasCSVParser()
+
+
+@pytest.fixture
+def excel_parser():
+    return ExcelParser()
+
+def test_csv_init_parser():
+    parser = CSVParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_pandas_csv_init_parser():
+    parser = PandasCSVParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_excel_init_parser():
+    parser = ExcelParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_csv_parser_concat_rows(csv_parser):
+    mock_data = "col1,col2\nvalue1,value2\nvalue3,value4"
+
+    with patch("builtins.open", mock_open(read_data=mock_data)):
+        result = csv_parser.parse_file(Path("test.csv"))
+        assert result == "col1, col2\nvalue1, value2\nvalue3, value4"
+
+
+def test_csv_parser_separate_rows(csv_parser):
+    csv_parser._concat_rows = False
+    mock_data = "col1,col2\nvalue1,value2\nvalue3,value4"
+
+    with patch("builtins.open", mock_open(read_data=mock_data)):
+        result = csv_parser.parse_file(Path("test.csv"))
+        assert result == ["col1, col2", "value1, value2", "value3, value4"]
+
+
+
+
+def test_pandas_csv_parser_concat_rows(pandas_csv_parser):
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4"
+        assert result == expected
+
+
+def test_pandas_csv_parser_separate_rows(pandas_csv_parser):
+    pandas_csv_parser._concat_rows = False
+    mock_df = MagicMock()
+    mock_df.apply.return_value.tolist.return_value = ["value1, value2", "value3, value4"]
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        assert result == ["value1, value2", "value3, value4"]
+
+
+def test_pandas_csv_parser_header_period(pandas_csv_parser):
+    pandas_csv_parser._header_period = 2
+
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"]))),
+        (2, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value5", "value6"])))
+    ]
+    mock_df.__len__.return_value = 3
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4\nHEADERS: col1, col2\nvalue5, value6"
+        assert result == expected
+
+
+def test_excel_parser_concat_rows(excel_parser):
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4"
+        assert result == expected
+
+
+def test_excel_parser_separate_rows(excel_parser):
+    excel_parser._concat_rows = False
+    mock_df = MagicMock()
+    mock_df.apply.return_value.tolist.return_value = ["value1, value2", "value3, value4"]
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        assert result == ["value1, value2", "value3, value4"]
+
+
+def test_excel_parser_header_period(excel_parser):
+    excel_parser._header_period = 1
+
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+    mock_df.__len__.return_value = 2
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        expected = "value1, value2\nHEADERS: col1, col2\nvalue3, value4"
+        assert result == expected
+
+def test_csv_parser_import_error(csv_parser):
+    import sys
+    with patch.dict(sys.modules, {"csv": None}):
+        with pytest.raises(ValueError, match="csv module is required to read CSV files"):
+            csv_parser.parse_file(Path("test.csv"))
+
+
+def test_pandas_csv_parser_import_error(pandas_csv_parser):
+    import sys
+    with patch.dict(sys.modules, {"pandas": None}):
+        with pytest.raises(ValueError, match="pandas module is required to read CSV files"):
+            pandas_csv_parser.parse_file(Path("test.csv"))
+
+
+def test_pandas_csv_parser_header_period_zero(pandas_csv_parser):
+    pandas_csv_parser._header_period = 0
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["c1", "c2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["v1", "v2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["v3", "v4"]))),
+    ]
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("f.csv"))
+    assert result == "HEADERS: c1, c2\nv1, v2\nv3, v4"
+
+
+def test_pandas_csv_parser_header_period_one(pandas_csv_parser):
+    pandas_csv_parser._header_period = 1
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["a", "b"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["x", "y"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["m", "n"]))),
+    ]
+    mock_df.__len__.return_value = 2
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("f.csv"))
+    assert result == "x, y\nHEADERS: a, b\nm, n"
+
+
+def test_pandas_csv_parser_passes_pandas_config():
+    parser = PandasCSVParser(pandas_config={"sep": ";", "header": 0})
+    mock_df = MagicMock()
+    with patch("pandas.read_csv", return_value=mock_df) as mock_read:
+        parser.parse_file(Path("conf.csv"))
+        kwargs = mock_read.call_args.kwargs
+        assert kwargs.get("sep") == ";"
+        assert kwargs.get("header") == 0
+
+
+def test_excel_parser_custom_joiners_and_prefix(excel_parser):
+    excel_parser._col_joiner = " | "
+    excel_parser._row_joiner = " || "
+    excel_parser._header_prefix = "COLUMNS: "
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["A", "B"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["x", "y"]))),
+    ]
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("t.xlsx"))
+    assert result == "COLUMNS: A | B || x | y"
+
+def test_excel_parser_import_error(excel_parser):
+    import sys
+    with patch.dict(sys.modules, {"pandas": None}):
+        with pytest.raises(ValueError, match="pandas module is required to read Excel files"):
+            excel_parser.parse_file(Path("test.xlsx"))