mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 17:13:15 +00:00
(test: files) add tests for the epub, image, and rst parsers
This commit is contained in:
152
tests/parser/file/test_epub_parser.py
Normal file
152
tests/parser/file/test_epub_parser.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
import sys
|
||||
import types
|
||||
|
||||
from application.parser.file.epub_parser import EpubParser
|
||||
|
||||
|
||||
@pytest.fixture
def epub_parser():
    """Provide a fresh EpubParser instance for each test."""
    parser = EpubParser()
    return parser
|
||||
|
||||
|
||||
def test_epub_init_parser():
    """_init_parser yields a dict config; init_parser marks it as applied."""
    parser = EpubParser()
    config = parser._init_parser()
    assert isinstance(config, dict)
    assert not parser.parser_config_set

    parser.init_parser()
    assert parser.parser_config_set
|
||||
|
||||
|
||||
def test_epub_parser_ebooklib_import_error(epub_parser):
    """Test that ValueError is raised when ebooklib is not available.

    The parser surfaces the missing optional dependency as a ValueError
    with an actionable message rather than a raw ImportError (the original
    docstring said ImportError, which contradicted the assertion below).
    """
    # A None entry in sys.modules makes ``import ebooklib`` fail inside parse_file.
    with patch.dict(sys.modules, {"ebooklib": None}):
        with pytest.raises(ValueError, match="`EbookLib` is required to read Epub files"):
            epub_parser.parse_file(Path("test.epub"))
|
||||
|
||||
|
||||
def test_epub_parser_html2text_import_error(epub_parser):
    """Test that ValueError is raised when html2text is not available.

    The parser reports the missing optional dependency as a ValueError
    (the original docstring said ImportError, contradicting the assertion).
    """
    # ebooklib must import cleanly so the failure is isolated to html2text.
    fake_ebooklib = types.ModuleType("ebooklib")
    fake_epub = types.ModuleType("ebooklib.epub")
    fake_ebooklib.epub = fake_epub

    with patch.dict(sys.modules, {"ebooklib": fake_ebooklib, "ebooklib.epub": fake_epub}):
        # A None entry makes ``import html2text`` raise inside parse_file.
        with patch.dict(sys.modules, {"html2text": None}):
            with pytest.raises(ValueError, match="`html2text` is required to parse Epub files"):
                epub_parser.parse_file(Path("test.epub"))
|
||||
|
||||
|
||||
def test_epub_parser_successful_parsing(epub_parser):
    """Test successful parsing of an epub file."""
    # Stand-in modules so parse_file's deferred imports resolve to our fakes.
    fake_ebooklib = types.ModuleType("ebooklib")
    fake_epub = types.ModuleType("ebooklib.epub")
    fake_html2text = types.ModuleType("html2text")

    fake_ebooklib.ITEM_DOCUMENT = "document"
    fake_ebooklib.epub = fake_epub

    def make_item(item_type, content):
        # Build a mock spine item reporting the given type and raw bytes.
        item = MagicMock()
        item.get_type.return_value = item_type
        item.get_content.return_value = content
        return item

    spine = [
        make_item("document", b"<h1>Chapter 1</h1><p>Content 1</p>"),
        make_item("document", b"<h1>Chapter 2</h1><p>Content 2</p>"),
        make_item("other", b"<p>Other content</p>"),  # should be ignored
    ]

    mock_book = MagicMock()
    mock_book.get_items.return_value = spine
    fake_epub.read_epub = MagicMock(return_value=mock_book)

    def mock_html2text_func(html_content):
        if "Chapter 1" in html_content:
            return "# Chapter 1\n\nContent 1\n"
        if "Chapter 2" in html_content:
            return "# Chapter 2\n\nContent 2\n"
        return "Other content\n"

    fake_html2text.html2text = mock_html2text_func

    fakes = {
        "ebooklib": fake_ebooklib,
        "ebooklib.epub": fake_epub,
        "html2text": fake_html2text,
    }
    with patch.dict(sys.modules, fakes):
        result = epub_parser.parse_file(Path("test.epub"))

    # Both document chapters are converted and concatenated; "other" is skipped.
    expected_result = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
    assert result == expected_result

    # Verify epub.read_epub was called with correct parameters
    fake_epub.read_epub.assert_called_once_with(Path("test.epub"), options={"ignore_ncx": True})
|
||||
|
||||
|
||||
def test_epub_parser_empty_book(epub_parser):
    """Test parsing an epub file with no document items."""
    fake_ebooklib = types.ModuleType("ebooklib")
    fake_epub = types.ModuleType("ebooklib.epub")
    fake_html2text = types.ModuleType("html2text")

    fake_ebooklib.ITEM_DOCUMENT = "document"
    fake_ebooklib.epub = fake_epub

    # A book whose spine yields no items at all.
    empty_book = MagicMock()
    empty_book.get_items.return_value = []

    fake_epub.read_epub = MagicMock(return_value=empty_book)
    fake_html2text.html2text = MagicMock()

    fakes = {
        "ebooklib": fake_ebooklib,
        "ebooklib.epub": fake_epub,
        "html2text": fake_html2text,
    }
    with patch.dict(sys.modules, fakes):
        parsed = epub_parser.parse_file(Path("empty.epub"))
        assert parsed == ""

    # With no document items the HTML converter must never be invoked.
    fake_html2text.html2text.assert_not_called()
|
||||
|
||||
|
||||
def test_epub_parser_non_document_items_ignored(epub_parser):
    """Test that non-document items are ignored during parsing."""
    fake_ebooklib = types.ModuleType("ebooklib")
    fake_epub = types.ModuleType("ebooklib.epub")
    fake_html2text = types.ModuleType("html2text")

    fake_ebooklib.ITEM_DOCUMENT = "document"
    fake_ebooklib.epub = fake_epub

    document_item = MagicMock()
    document_item.get_type.return_value = "document"
    document_item.get_content.return_value = b"<p>Document content</p>"

    image_item = MagicMock()
    image_item.get_type.return_value = "image"  # not ITEM_DOCUMENT, so skipped

    book = MagicMock()
    # Non-document item listed first to show ordering does not matter.
    book.get_items.return_value = [image_item, document_item]

    fake_epub.read_epub = MagicMock(return_value=book)
    fake_html2text.html2text = MagicMock(return_value="Document content\n")

    fakes = {
        "ebooklib": fake_ebooklib,
        "ebooklib.epub": fake_epub,
        "html2text": fake_html2text,
    }
    with patch.dict(sys.modules, fakes):
        parsed = epub_parser.parse_file(Path("test.epub"))

    assert parsed == "Document content\n"
    # Only the single document item's HTML reached the converter.
    fake_html2text.html2text.assert_called_once_with("<p>Document content</p>")
|
||||
42
tests/parser/file/test_image_parser.py
Normal file
42
tests/parser/file/test_image_parser.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock, mock_open
|
||||
|
||||
from application.parser.file.image_parser import ImageParser
|
||||
|
||||
|
||||
def test_image_init_parser():
    """_init_parser yields a dict config; init_parser marks it as applied."""
    parser = ImageParser()
    config = parser._init_parser()
    assert isinstance(config, dict)
    assert not parser.parser_config_set

    parser.init_parser()
    assert parser.parser_config_set
|
||||
|
||||
|
||||
@patch("application.parser.file.image_parser.settings")
def test_image_parser_remote_true(mock_settings):
    """With remote parsing enabled the image is POSTed and markdown returned."""
    mock_settings.PARSE_IMAGE_REMOTE = True
    parser = ImageParser()

    fake_response = MagicMock()
    fake_response.json.return_value = {"markdown": "# From Image"}

    post_target = "application.parser.file.image_parser.requests.post"
    with patch(post_target, return_value=fake_response) as mock_post, \
            patch("builtins.open", mock_open()):
        parsed = parser.parse_file(Path("img.png"))

    assert parsed == "# From Image"
    mock_post.assert_called_once()
|
||||
|
||||
|
||||
@patch("application.parser.file.image_parser.settings")
def test_image_parser_remote_false(mock_settings):
    """With remote parsing disabled, no request is made and '' is returned."""
    mock_settings.PARSE_IMAGE_REMOTE = False
    parser = ImageParser()

    with patch("application.parser.file.image_parser.requests.post") as mock_post:
        parsed = parser.parse_file(Path("img.png"))

    assert parsed == ""
    mock_post.assert_not_called()
|
||||
|
||||
284
tests/parser/file/test_rst_parser.py
Normal file
284
tests/parser/file/test_rst_parser.py
Normal file
@@ -0,0 +1,284 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
|
||||
|
||||
@pytest.fixture
def rst_parser():
    """Provide an RstParser configured with its default cleaning options."""
    parser = RstParser()
    return parser
|
||||
|
||||
|
||||
@pytest.fixture
def rst_parser_custom():
    """Provide an RstParser with every cleaning option switched off."""
    disabled = dict(
        remove_hyperlinks=False,
        remove_images=False,
        remove_table_excess=False,
        remove_interpreters=False,
        remove_directives=False,
        remove_whitespaces_excess=False,
        remove_characters_excess=False,
    )
    return RstParser(**disabled)
|
||||
|
||||
|
||||
def test_rst_init_parser():
    """_init_parser yields a dict config; init_parser marks it as applied."""
    parser = RstParser()
    config = parser._init_parser()
    assert isinstance(config, dict)
    assert not parser.parser_config_set

    parser.init_parser()
    assert parser.parser_config_set
|
||||
|
||||
|
||||
def test_rst_parser_initialization_with_custom_options():
    """Test RstParser initialization with custom options."""
    flags = dict(
        remove_hyperlinks=False,
        remove_images=False,
        remove_table_excess=False,
        remove_interpreters=False,
        remove_directives=False,
        remove_whitespaces_excess=False,
        remove_characters_excess=False,
    )
    parser = RstParser(**flags)

    # Each keyword should land on the matching underscored attribute as False.
    for option in flags:
        assert not getattr(parser, f"_{option}")
|
||||
|
||||
|
||||
def test_rst_parser_default_initialization():
    """Test RstParser initialization with default options."""
    parser = RstParser()

    # Every cleaning toggle defaults to enabled.
    for attribute in (
        "_remove_hyperlinks",
        "_remove_images",
        "_remove_table_excess",
        "_remove_interpreters",
        "_remove_directives",
        "_remove_whitespaces_excess",
        "_remove_characters_excess",
    ):
        assert getattr(parser, attribute)
|
||||
|
||||
|
||||
def test_remove_hyperlinks():
    """remove_hyperlinks keeps the link text but drops the target URL."""
    parser = RstParser()
    source = "This is a `link text <http://example.com>`_ and more text."
    cleaned = parser.remove_hyperlinks(source)
    assert cleaned == "This is a link text and more text."
|
||||
|
||||
|
||||
def test_remove_images():
    """remove_images strips `.. image::` directive lines from the content."""
    parser = RstParser()
    source = "Some text\n.. image:: path/to/image.png\nMore text"
    cleaned = parser.remove_images(source)
    assert cleaned == "Some text\n\nMore text"
|
||||
|
||||
|
||||
def test_remove_directives():
    """remove_directives drops backtick-quoted `..name::` markers."""
    parser = RstParser()
    source = "Text with `..note::` directive and more text"
    cleaned = parser.remove_directives(source)
    # The pattern matches the `..note::` token, leaving a stray backtick behind.
    assert cleaned == "Text with ` directive and more text"
|
||||
|
||||
|
||||
def test_remove_interpreters():
    """remove_interpreters strips :role: markers such as :doc: and :ref:."""
    parser = RstParser()
    source = "Text with :doc: role and :ref: another role"
    cleaned = parser.remove_interpreters(source)
    assert cleaned == "Text with role and another role"
|
||||
|
||||
|
||||
def test_remove_table_excess():
    """remove_table_excess drops +---+ separator rows but keeps cell rows."""
    parser = RstParser()
    table = "Header\n+-----+-----+\n| A | B |\n+-----+-----+\nFooter"
    cleaned = parser.remove_table_excess(table)

    assert "+-----+-----+" not in cleaned
    for kept in ("Header", "| A | B |", "Footer"):
        assert kept in cleaned
|
||||
|
||||
|
||||
def test_chunk_by_token_count():
    """chunk_by_token_count splits a long text into several bounded pieces."""
    parser = RstParser()
    text = "This is a long text that should be chunked into smaller pieces based on token count"
    pieces = parser.chunk_by_token_count(text, max_tokens=5)

    # The cap of five tokens forces more than one chunk.
    assert len(pieces) > 1

    # Roughly 5 tokens * ~5 chars each; 30 allows headroom for word boundaries.
    assert all(len(piece) <= 30 for piece in pieces)
|
||||
|
||||
|
||||
def test_rst_to_tups_with_headers():
    """Test RST to tuples conversion with headers."""
    parser = RstParser()
    rst_content = """Introduction
============

This is the introduction text.

Chapter 1
=========

This is chapter 1 content.
More content here.

Chapter 2
=========

This is chapter 2 content."""

    tups = parser.rst_to_tups(rst_content)

    # At minimum the intro and the chapters should be split apart.
    assert len(tups) >= 2

    # Every section title must appear among the captured headers.
    found_headers = [tup[0] for tup in tups if tup[0] is not None]
    for expected in ("Introduction", "Chapter 1", "Chapter 2"):
        assert expected in found_headers
|
||||
|
||||
|
||||
def test_rst_to_tups_without_headers():
    """Header-less RST collapses to a single (None, text) tuple."""
    parser = RstParser()
    plain = "Just plain text without any headers or structure."

    tups = parser.rst_to_tups(plain)

    assert len(tups) == 1
    header, body = tups[0]
    assert header is None
    assert "Just plain text" in body
|
||||
|
||||
|
||||
def test_parse_file_basic(rst_parser):
    """Test basic parse_file functionality."""
    content = """Title
=====

This is some content.

Subtitle
--------

More content here."""

    with patch("builtins.open", mock_open(read_data=content)):
        parsed = rst_parser.parse_file(Path("test.rst"))

    # parse_file yields a non-empty list of strings.
    assert isinstance(parsed, list)
    assert len(parsed) >= 1

    # The cleaned output still carries the headings and body text.
    merged = "\n".join(parsed)
    assert "Title" in merged
    assert "content" in merged
|
||||
|
||||
|
||||
def test_parse_file_with_hyperlinks(rst_parser_custom):
    """Test parse_file with hyperlinks when removal is disabled."""
    content = "Text with `link <http://example.com>`_ here."

    with patch("builtins.open", mock_open(read_data=content)):
        parsed = rst_parser_custom.parse_file(Path("test.rst"))

    # With remove_hyperlinks=False the URL must survive parsing.
    assert "http://example.com" in "\n".join(parsed)
|
||||
|
||||
|
||||
def test_parse_tups_with_max_tokens():
    """Test parse_tups with token chunking."""
    parser = RstParser()
    content = """Header
======

This is a very long piece of content that should be chunked into smaller pieces when max_tokens is specified. It contains multiple sentences and should be split appropriately."""

    with patch("builtins.open", mock_open(read_data=content)):
        tups = parser.parse_tups(Path("test.rst"), max_tokens=10)

    # The token cap forces the body to be split into several tuples.
    assert len(tups) > 1

    # Chunked sections are labelled with a "Chunk" marker in the header.
    headers = [tup[0] for tup in tups]
    assert any("Chunk" in str(header) for header in headers if header)
|
||||
|
||||
|
||||
def test_parse_tups_without_max_tokens():
    """Test parse_tups without token chunking."""
    parser = RstParser()
    content = """Header
======

Content here."""

    with patch("builtins.open", mock_open(read_data=content)):
        tups = parser.parse_tups(Path("test.rst"), max_tokens=None)

    assert len(tups) >= 1

    # Without a token cap no "Chunk" headers should be synthesized.
    headers = [tup[0] for tup in tups]
    assert not any("Chunk" in str(header) for header in headers if header)
|
||||
|
||||
|
||||
def test_parse_file_empty_content():
    """Test parse_file with empty content."""
    parser = RstParser()

    with patch("builtins.open", mock_open(read_data="")):
        parsed = parser.parse_file(Path("empty.rst"))

    # An empty file should still yield a list rather than raising.
    assert isinstance(parsed, list)
|
||||
|
||||
|
||||
def test_all_cleaning_methods_applied():
    """Test that all cleaning methods are applied when enabled."""
    parser = RstParser()
    content = """Title
=====

Text with `link <http://example.com>`_ and :doc:`reference`.

.. image:: image.png

+-----+-----+
| A | B |
+-----+-----+

`..note::` This is a note."""

    with patch("builtins.open", mock_open(read_data=content)):
        parsed = parser.parse_file(Path("test.rst"))

    merged = "\n".join(parsed)

    # Each enabled cleaner should have removed its target construct.
    assert "http://example.com" not in merged  # hyperlink targets removed
    assert ":doc:" not in merged  # interpreted-text roles removed
    assert ".. image::" not in merged  # image directives removed
    assert "+-----+" not in merged  # table separator rows removed
    # The directive regex targets backtick-quoted `..name::` tokens specifically,
    # so `..note::` (but not a plain .. note::) is removed.
    assert "`..note::`" not in merged
|
||||
Reference in New Issue
Block a user