From ba496a772b184e49519ddd365be7b7b2a9bd673f Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Fri, 26 Sep 2025 16:07:12 +0530
Subject: [PATCH] (test) doc parsers coverage

---
Reviewer note (hand-edited after `git format-patch`):
test_docs_parser.py no longer replaces parse_file with a stub before calling
it; the PDF/DOCX tests now run the real parse_file against fake `pypdf` and
`docx2txt` modules injected through sys.modules. The blob id on that file's
`index` line predates this edit.

 tests/parser/file/test_docs_parser.py    | 107 +++++++++++
 tests/parser/file/test_pptx_parser.py    |  61 +++++++
 tests/parser/file/test_tabular_parser.py | 215 +++++++++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 tests/parser/file/test_docs_parser.py
 create mode 100644 tests/parser/file/test_pptx_parser.py
 create mode 100644 tests/parser/file/test_tabular_parser.py

diff --git a/tests/parser/file/test_docs_parser.py b/tests/parser/file/test_docs_parser.py
new file mode 100644
index 00000000..c0de52ec
--- /dev/null
+++ b/tests/parser/file/test_docs_parser.py
@@ -0,0 +1,107 @@
+import sys
+import types
+from pathlib import Path
+from unittest.mock import MagicMock, mock_open, patch
+
+import pytest
+
+from application.parser.file.docs_parser import PDFParser, DocxParser
+
+
+@pytest.fixture
+def pdf_parser():
+    """Provide a fresh PDFParser for each test."""
+    return PDFParser()
+
+
+@pytest.fixture
+def docx_parser():
+    """Provide a fresh DocxParser for each test."""
+    return DocxParser()
+
+
+def _fake_module(name, **attrs):
+    """Build a stand-in module that can be injected through sys.modules."""
+    module = types.ModuleType(name)
+    for attr_name, value in attrs.items():
+        setattr(module, attr_name, value)
+    return module
+
+
+def test_pdf_init_parser():
+    """init_parser() marks the config as set; _init_parser() returns a dict."""
+    parser = PDFParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_docx_init_parser():
+    """init_parser() marks the config as set; _init_parser() returns a dict."""
+    parser = DocxParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+@patch("application.parser.file.docs_parser.settings")
+def test_parse_pdf_with_pypdf(mock_settings, pdf_parser):
+    """Run the real parse_file against a fake pypdf and check the joined text."""
+    mock_settings.PARSE_PDF_AS_IMAGE = False
+
+    pages = []
+    for text in ("Test PDF content page 1", "Test PDF content page 2"):
+        page = MagicMock()
+        page.extract_text.return_value = text
+        pages.append(page)
+    reader = MagicMock()
+    reader.pages = pages
+    # NOTE(review): assumes parse_file reads pages through pypdf.PdfReader;
+    # confirm against docs_parser.
+    fake_pypdf = _fake_module("pypdf", PdfReader=MagicMock(return_value=reader))
+
+    # open() is stubbed as well so the test never needs a file on disk.
+    with patch.dict(sys.modules, {"pypdf": fake_pypdf}), patch(
+        "builtins.open", mock_open(read_data=b"")
+    ):
+        result = pdf_parser.parse_file(Path("test.pdf"))
+
+    assert result == "Test PDF content page 1\nTest PDF content page 2"
+    fake_pypdf.PdfReader.assert_called_once()
+
+
+@patch("application.parser.file.docs_parser.settings")
+def test_parse_pdf_pypdf_import_error(mock_settings, pdf_parser):
+    """A pypdf import failure must surface from the real parse_file as ValueError."""
+    mock_settings.PARSE_PDF_AS_IMAGE = False
+
+    # A None entry in sys.modules makes any import of that name fail.
+    with patch.dict(sys.modules, {"pypdf": None}):
+        with pytest.raises(ValueError, match="pypdf is required to read PDF files"):
+            pdf_parser.parse_file(Path("test.pdf"))
+
+
+def test_parse_docx(docx_parser):
+    """Run the real parse_file against a fake docx2txt and check its output."""
+    # NOTE(review): assumes parse_file delegates to docx2txt.process;
+    # confirm against docs_parser.
+    fake_docx2txt = _fake_module(
+        "docx2txt", process=MagicMock(return_value="Test DOCX content")
+    )
+
+    with patch.dict(sys.modules, {"docx2txt": fake_docx2txt}):
+        result = docx_parser.parse_file(Path("test.docx"))
+
+    assert result == "Test DOCX content"
+    fake_docx2txt.process.assert_called_once()
+
+
+def test_parse_docx_import_error(docx_parser):
+    """A docx2txt import failure must surface from parse_file as ValueError."""
+    with patch.dict(sys.modules, {"docx2txt": None}):
+        with pytest.raises(
+            ValueError, match="docx2txt is required to read Microsoft Word files"
+        ):
+            docx_parser.parse_file(Path("test.docx"))
diff --git a/tests/parser/file/test_pptx_parser.py b/tests/parser/file/test_pptx_parser.py
new file mode 100644
index 00000000..1ea415c2
--- /dev/null
+++ b/tests/parser/file/test_pptx_parser.py
@@ -0,0 +1,61 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch
+
+from application.parser.file.pptx_parser import PPTXParser
+
+
+def test_pptx_init_parser():
+    parser = PPTXParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def _fake_presentation_with(slides_shapes_texts):
+    class Shape:
+        def __init__(self, text=None):
+            if text is not None:
+                self.text = text
+    class Slide:
+        def __init__(self, texts):
+            self.shapes = [Shape(t) for t in texts]
+    class Pres:
+        def __init__(self, _file):
+            self.slides = [Slide(texts) for texts in slides_shapes_texts]
+    return Pres
+
+
+def test_pptx_parser_concat_true():
+    slides = [["Hello ", "World"], ["Slide2"]]
+    FakePres = _fake_presentation_with(slides)
+    import sys, types
+    fake_pptx = types.ModuleType("pptx")
+    fake_pptx.Presentation = FakePres
+    parser = PPTXParser()
+    with patch.dict(sys.modules, {"pptx": fake_pptx}):
+        result = parser.parse_file(Path("deck.pptx"))
+        assert result == "Hello World\nSlide2"
+
+
+def test_pptx_parser_list_mode():
+    slides = [[" A ", "B"], [" C "]]
+    FakePres = _fake_presentation_with(slides)
+    import sys, types
+    fake_pptx = types.ModuleType("pptx")
+    fake_pptx.Presentation = FakePres
+    parser = PPTXParser()
+    parser._concat_slides = False
+    with patch.dict(sys.modules, {"pptx": fake_pptx}):
+        result = parser.parse_file(Path("deck.pptx"))
+        assert result == ["A B", "C"]
+
+
+def test_pptx_parser_import_error():
+    parser = PPTXParser()
+    import sys
+    with patch.dict(sys.modules, {"pptx": None}):
+        with pytest.raises(ImportError, match="pptx module is required to read .PPTX files"):
+            parser.parse_file(Path("missing.pptx"))
+
diff --git a/tests/parser/file/test_tabular_parser.py b/tests/parser/file/test_tabular_parser.py
new file mode 100644
index 00000000..3dbab672
--- /dev/null
+++ b/tests/parser/file/test_tabular_parser.py
@@ -0,0 +1,215 @@
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock, mock_open
+
+from application.parser.file.tabular_parser import CSVParser, PandasCSVParser, ExcelParser
+
+
+@pytest.fixture
+def csv_parser():
+    return CSVParser()
+
+
+@pytest.fixture
+def pandas_csv_parser():
+    return PandasCSVParser()
+
+
+@pytest.fixture
+def excel_parser():
+    return ExcelParser()
+
+def test_csv_init_parser():
+    parser = CSVParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_pandas_csv_init_parser():
+    parser = PandasCSVParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_excel_init_parser():
+    parser = ExcelParser()
+    assert isinstance(parser._init_parser(), dict)
+    assert not parser.parser_config_set
+    parser.init_parser()
+    assert parser.parser_config_set
+
+
+def test_csv_parser_concat_rows(csv_parser):
+    mock_data = "col1,col2\nvalue1,value2\nvalue3,value4"
+
+    with patch("builtins.open", mock_open(read_data=mock_data)):
+        result = csv_parser.parse_file(Path("test.csv"))
+        assert result == "col1, col2\nvalue1, value2\nvalue3, value4"
+
+
+def test_csv_parser_separate_rows(csv_parser):
+    csv_parser._concat_rows = False
+    mock_data = "col1,col2\nvalue1,value2\nvalue3,value4"
+
+    with patch("builtins.open", mock_open(read_data=mock_data)):
+        result = csv_parser.parse_file(Path("test.csv"))
+        assert result == ["col1, col2", "value1, value2", "value3, value4"]
+
+
+
+
+def test_pandas_csv_parser_concat_rows(pandas_csv_parser):
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4"
+        assert result == expected
+
+
+def test_pandas_csv_parser_separate_rows(pandas_csv_parser):
+    pandas_csv_parser._concat_rows = False
+    mock_df = MagicMock()
+    mock_df.apply.return_value.tolist.return_value = ["value1, value2", "value3, value4"]
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        assert result == ["value1, value2", "value3, value4"]
+
+
+def test_pandas_csv_parser_header_period(pandas_csv_parser):
+    pandas_csv_parser._header_period = 2
+
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"]))),
+        (2, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value5", "value6"])))
+    ]
+    mock_df.__len__.return_value = 3
+
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("test.csv"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4\nHEADERS: col1, col2\nvalue5, value6"
+        assert result == expected
+
+
+def test_excel_parser_concat_rows(excel_parser):
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        expected = "HEADERS: col1, col2\nvalue1, value2\nvalue3, value4"
+        assert result == expected
+
+
+def test_excel_parser_separate_rows(excel_parser):
+    excel_parser._concat_rows = False
+    mock_df = MagicMock()
+    mock_df.apply.return_value.tolist.return_value = ["value1, value2", "value3, value4"]
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        assert result == ["value1, value2", "value3, value4"]
+
+
+def test_excel_parser_header_period(excel_parser):
+    excel_parser._header_period = 1
+
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["col1", "col2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value1", "value2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["value3", "value4"])))
+    ]
+    mock_df.__len__.return_value = 2
+
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("test.xlsx"))
+        expected = "value1, value2\nHEADERS: col1, col2\nvalue3, value4"
+        assert result == expected
+
+def test_csv_parser_import_error(csv_parser):
+    import sys
+    with patch.dict(sys.modules, {"csv": None}):
+        with pytest.raises(ValueError, match="csv module is required to read CSV files"):
+            csv_parser.parse_file(Path("test.csv"))
+
+
+def test_pandas_csv_parser_import_error(pandas_csv_parser):
+    import sys
+    with patch.dict(sys.modules, {"pandas": None}):
+        with pytest.raises(ValueError, match="pandas module is required to read CSV files"):
+            pandas_csv_parser.parse_file(Path("test.csv"))
+
+
+def test_pandas_csv_parser_header_period_zero(pandas_csv_parser):
+    pandas_csv_parser._header_period = 0
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["c1", "c2"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["v1", "v2"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["v3", "v4"]))),
+    ]
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("f.csv"))
+        assert result == "HEADERS: c1, c2\nv1, v2\nv3, v4"
+
+
+def test_pandas_csv_parser_header_period_one(pandas_csv_parser):
+    pandas_csv_parser._header_period = 1
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["a", "b"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["x", "y"]))),
+        (1, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["m", "n"]))),
+    ]
+    mock_df.__len__.return_value = 2
+    with patch("pandas.read_csv", return_value=mock_df):
+        result = pandas_csv_parser.parse_file(Path("f.csv"))
+        assert result == "x, y\nHEADERS: a, b\nm, n"
+
+
+def test_pandas_csv_parser_passes_pandas_config():
+    parser = PandasCSVParser(pandas_config={"sep": ";", "header": 0})
+    mock_df = MagicMock()
+    with patch("pandas.read_csv", return_value=mock_df) as mock_read:
+        parser.parse_file(Path("conf.csv"))
+        kwargs = mock_read.call_args.kwargs
+        assert kwargs.get("sep") == ";"
+        assert kwargs.get("header") == 0
+
+
+def test_excel_parser_custom_joiners_and_prefix(excel_parser):
+    excel_parser._col_joiner = " | "
+    excel_parser._row_joiner = " || "
+    excel_parser._header_prefix = "COLUMNS: "
+    mock_df = MagicMock()
+    mock_df.columns.tolist.return_value = ["A", "B"]
+    mock_df.iterrows.return_value = [
+        (0, MagicMock(astype=lambda _: MagicMock(tolist=lambda: ["x", "y"]))),
+    ]
+    with patch("pandas.read_excel", return_value=mock_df):
+        result = excel_parser.parse_file(Path("t.xlsx"))
+        assert result == "COLUMNS: A | B || x | y"
+
+def test_excel_parser_import_error(excel_parser):
+    import sys
+    with patch.dict(sys.modules, {"pandas": None}):
+        with pytest.raises(ValueError, match="pandas module is required to read Excel files"):
+            excel_parser.parse_file(Path("test.xlsx"))
\ No newline at end of file