From 2b38f80d04d3549d7d7d6891ca2694386c4e0f20 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 29 Sep 2025 17:39:20 +0530 Subject: [PATCH] (test:files) epub, image, rst --- tests/parser/file/test_epub_parser.py | 152 +++++++++++++ tests/parser/file/test_image_parser.py | 42 ++++ tests/parser/file/test_rst_parser.py | 284 +++++++++++++++++++++++++ 3 files changed, 478 insertions(+) create mode 100644 tests/parser/file/test_epub_parser.py create mode 100644 tests/parser/file/test_image_parser.py create mode 100644 tests/parser/file/test_rst_parser.py diff --git a/tests/parser/file/test_epub_parser.py b/tests/parser/file/test_epub_parser.py new file mode 100644 index 00000000..519ae2a4 --- /dev/null +++ b/tests/parser/file/test_epub_parser.py @@ -0,0 +1,152 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +import sys +import types + +from application.parser.file.epub_parser import EpubParser + + +@pytest.fixture +def epub_parser(): + return EpubParser() + + +def test_epub_init_parser(): + parser = EpubParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +def test_epub_parser_ebooklib_import_error(epub_parser): + """Test that ImportError is raised when ebooklib is not available.""" + with patch.dict(sys.modules, {"ebooklib": None}): + with pytest.raises(ValueError, match="`EbookLib` is required to read Epub files"): + epub_parser.parse_file(Path("test.epub")) + + +def test_epub_parser_html2text_import_error(epub_parser): + """Test that ImportError is raised when html2text is not available.""" + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_ebooklib.epub = fake_epub + + with patch.dict(sys.modules, {"ebooklib": fake_ebooklib, "ebooklib.epub": fake_epub}): + with patch.dict(sys.modules, {"html2text": None}): + with pytest.raises(ValueError, match="`html2text` is required to parse Epub files"): + epub_parser.parse_file(Path("test.epub")) + + +def test_epub_parser_successful_parsing(epub_parser): + """Test successful parsing of an epub file.""" + + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + # Mock ebooklib constants + fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + mock_item1 = MagicMock() + mock_item1.get_type.return_value = "document" + mock_item1.get_content.return_value = b"

Chapter 1

Content 1

" + + mock_item2 = MagicMock() + mock_item2.get_type.return_value = "document" + mock_item2.get_content.return_value = b"

Chapter 2

Content 2

" + + mock_item3 = MagicMock() + mock_item3.get_type.return_value = "other" # Should be ignored + mock_item3.get_content.return_value = b"

Other content

" + + mock_book = MagicMock() + mock_book.get_items.return_value = [mock_item1, mock_item2, mock_item3] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + + def mock_html2text_func(html_content): + if "Chapter 1" in html_content: + return "# Chapter 1\n\nContent 1\n" + elif "Chapter 2" in html_content: + return "# Chapter 2\n\nContent 2\n" + return "Other content\n" + + fake_html2text.html2text = mock_html2text_func + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("test.epub")) + + expected_result = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n" + assert result == expected_result + + # Verify epub.read_epub was called with correct parameters + fake_epub.read_epub.assert_called_once_with(Path("test.epub"), options={"ignore_ncx": True}) + + +def test_epub_parser_empty_book(epub_parser): + """Test parsing an epub file with no document items.""" + # Create mock modules + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + # Create mock book with no document items + mock_book = MagicMock() + mock_book.get_items.return_value = [] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + fake_html2text.html2text = MagicMock() + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("empty.epub")) + assert result == "" + + fake_html2text.html2text.assert_not_called() + + +def test_epub_parser_non_document_items_ignored(epub_parser): + """Test that non-document items are ignored during parsing.""" + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + mock_doc_item = MagicMock() + mock_doc_item.get_type.return_value = "document" + mock_doc_item.get_content.return_value = b"

Document content

" + + mock_other_item = MagicMock() + mock_other_item.get_type.return_value = "image" # Not a document + + mock_book = MagicMock() + mock_book.get_items.return_value = [mock_other_item, mock_doc_item] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + fake_html2text.html2text = MagicMock(return_value="Document content\n") + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("test.epub")) + + assert result == "Document content\n" + + fake_html2text.html2text.assert_called_once_with("

Document content

") diff --git a/tests/parser/file/test_image_parser.py b/tests/parser/file/test_image_parser.py new file mode 100644 index 00000000..c5df48b4 --- /dev/null +++ b/tests/parser/file/test_image_parser.py @@ -0,0 +1,42 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock, mock_open + +from application.parser.file.image_parser import ImageParser + + +def test_image_init_parser(): + parser = ImageParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +@patch("application.parser.file.image_parser.settings") +def test_image_parser_remote_true(mock_settings): + mock_settings.PARSE_IMAGE_REMOTE = True + parser = ImageParser() + + mock_response = MagicMock() + mock_response.json.return_value = {"markdown": "# From Image"} + + with patch("application.parser.file.image_parser.requests.post", return_value=mock_response) as mock_post: + with patch("builtins.open", mock_open()): + result = parser.parse_file(Path("img.png")) + + assert result == "# From Image" + mock_post.assert_called_once() + + +@patch("application.parser.file.image_parser.settings") +def test_image_parser_remote_false(mock_settings): + mock_settings.PARSE_IMAGE_REMOTE = False + parser = ImageParser() + + with patch("application.parser.file.image_parser.requests.post") as mock_post: + result = parser.parse_file(Path("img.png")) + + assert result == "" + mock_post.assert_not_called() + diff --git a/tests/parser/file/test_rst_parser.py b/tests/parser/file/test_rst_parser.py new file mode 100644 index 00000000..b7466234 --- /dev/null +++ b/tests/parser/file/test_rst_parser.py @@ -0,0 +1,284 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, mock_open + +from application.parser.file.rst_parser import RstParser + + +@pytest.fixture +def rst_parser(): + return RstParser() + + +@pytest.fixture +def rst_parser_custom(): + return RstParser( + remove_hyperlinks=False, + remove_images=False, + remove_table_excess=False, + remove_interpreters=False, + remove_directives=False, + remove_whitespaces_excess=False, + remove_characters_excess=False + ) + + +def test_rst_init_parser(): + parser = RstParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +def test_rst_parser_initialization_with_custom_options(): + """Test RstParser initialization with custom options.""" + parser = RstParser( + remove_hyperlinks=False, + remove_images=False, + remove_table_excess=False, + remove_interpreters=False, + remove_directives=False, + remove_whitespaces_excess=False, + remove_characters_excess=False + ) + + assert not parser._remove_hyperlinks + assert not parser._remove_images + assert not parser._remove_table_excess + assert not parser._remove_interpreters + assert not parser._remove_directives + assert not parser._remove_whitespaces_excess + assert not parser._remove_characters_excess + + +def test_rst_parser_default_initialization(): + """Test RstParser initialization with default options.""" + parser = RstParser() + + assert parser._remove_hyperlinks + assert parser._remove_images + assert parser._remove_table_excess + assert parser._remove_interpreters + assert parser._remove_directives + assert parser._remove_whitespaces_excess + assert parser._remove_characters_excess + + +def test_remove_hyperlinks(): + """Test hyperlink removal functionality.""" + parser = RstParser() + content = "This is a `link text `_ and more text." + result = parser.remove_hyperlinks(content) + assert result == "This is a link text and more text." + + +def test_remove_images(): + """Test image removal functionality.""" + parser = RstParser() + content = "Some text\n.. image:: path/to/image.png\nMore text" + result = parser.remove_images(content) + assert result == "Some text\n\nMore text" + + +def test_remove_directives(): + """Test directive removal functionality.""" + parser = RstParser() + content = "Text with `..note::` directive and more text" + result = parser.remove_directives(content) + # The regex pattern looks for `..something::` so it should remove `..note::` + assert result == "Text with ` directive and more text" + + +def test_remove_interpreters(): + """Test interpreter removal functionality.""" + parser = RstParser() + content = "Text with :doc: role and :ref: another role" + result = parser.remove_interpreters(content) + assert result == "Text with role and another role" + + +def test_remove_table_excess(): + """Test table separator removal functionality.""" + parser = RstParser() + content = "Header\n+-----+-----+\n| A | B |\n+-----+-----+\nFooter" + result = parser.remove_table_excess(content) + assert "+-----+-----+" not in result + assert "Header" in result + assert "| A | B |" in result + assert "Footer" in result + + +def test_chunk_by_token_count(): + """Test token-based chunking functionality.""" + parser = RstParser() + text = "This is a long text that should be chunked into smaller pieces based on token count" + chunks = parser.chunk_by_token_count(text, max_tokens=5) + + # Should create multiple chunks + assert len(chunks) > 1 + + # Each chunk should be reasonably sized (approximately 5 * 5 = 25 characters) + for chunk in chunks: + assert len(chunk) <= 30 # Allow some flexibility + + +def test_rst_to_tups_with_headers(): + """Test RST to tuples conversion with headers.""" + parser = RstParser() + rst_content = """Introduction +============ + +This is the introduction text. + +Chapter 1 +========= + +This is chapter 1 content. +More content here. + +Chapter 2 +========= + +This is chapter 2 content.""" + + tups = parser.rst_to_tups(rst_content) + + # Should have 3 tuples (intro, chapter 1, chapter 2) + assert len(tups) >= 2 + + # Check that headers are captured + headers = [tup[0] for tup in tups if tup[0] is not None] + assert "Introduction" in headers + assert "Chapter 1" in headers + assert "Chapter 2" in headers + + +def test_rst_to_tups_without_headers(): + """Test RST to tuples conversion without headers.""" + parser = RstParser() + rst_content = "Just plain text without any headers or structure." + + tups = parser.rst_to_tups(rst_content) + + # Should have one tuple with None header + assert len(tups) == 1 + assert tups[0][0] is None + assert "Just plain text" in tups[0][1] + + +def test_parse_file_basic(rst_parser): + """Test basic parse_file functionality.""" + content = """Title +===== + +This is some content. + +Subtitle +-------- + +More content here.""" + + with patch("builtins.open", mock_open(read_data=content)): + result = rst_parser.parse_file(Path("test.rst")) + + # Should return a list of strings + assert isinstance(result, list) + assert len(result) >= 1 + + # Content should be processed and cleaned + joined_result = "\n".join(result) + assert "Title" in joined_result + assert "content" in joined_result + + +def test_parse_file_with_hyperlinks(rst_parser_custom): + """Test parse_file with hyperlinks when removal is disabled.""" + content = "Text with `link `_ here." + + with patch("builtins.open", mock_open(read_data=content)): + result = rst_parser_custom.parse_file(Path("test.rst")) + + joined_result = "\n".join(result) + # Hyperlinks should be preserved when removal is disabled + assert "http://example.com" in joined_result + + +def test_parse_tups_with_max_tokens(): + """Test parse_tups with token chunking.""" + parser = RstParser() + content = """Header +====== + +This is a very long piece of content that should be chunked into smaller pieces when max_tokens is specified. It contains multiple sentences and should be split appropriately.""" + + with patch("builtins.open", mock_open(read_data=content)): + tups = parser.parse_tups(Path("test.rst"), max_tokens=10) + + # Should create multiple chunks due to token limit + assert len(tups) > 1 + + # Each tuple should have a header indicating chunk number + chunk_headers = [tup[0] for tup in tups] + assert any("Chunk" in str(header) for header in chunk_headers if header) + + +def test_parse_tups_without_max_tokens(): + """Test parse_tups without token chunking.""" + parser = RstParser() + content = """Header +====== + +Content here.""" + + with patch("builtins.open", mock_open(read_data=content)): + tups = parser.parse_tups(Path("test.rst"), max_tokens=None) + + # Should not create additional chunks + assert len(tups) >= 1 + + # Headers should not contain "Chunk" + chunk_headers = [tup[0] for tup in tups] + assert not any("Chunk" in str(header) for header in chunk_headers if header) + + +def test_parse_file_empty_content(): + """Test parse_file with empty content.""" + parser = RstParser() + + with patch("builtins.open", mock_open(read_data="")): + result = parser.parse_file(Path("empty.rst")) + + # Should handle empty content gracefully + assert isinstance(result, list) + + +def test_all_cleaning_methods_applied(): + """Test that all cleaning methods are applied when enabled.""" + parser = RstParser() + content = """Title +===== + +Text with `link `_ and :doc:`reference`. + +.. image:: image.png + ++-----+-----+ +| A | B | ++-----+-----+ + +`..note::` This is a note.""" + + with patch("builtins.open", mock_open(read_data=content)): + result = parser.parse_file(Path("test.rst")) + + joined_result = "\n".join(result) + + # All unwanted elements should be removed + assert "http://example.com" not in joined_result # hyperlinks removed + assert ":doc:" not in joined_result # interpreters removed + assert ".. image::" not in joined_result # images removed + assert "+-----+" not in joined_result # table excess removed + # The directive pattern looks for `..something::` so regular .. note:: won't be removed + # but `..note::` will be removed + assert "`..note::`" not in joined_result # directives removed