diff --git a/tests/parser/file/test_epub_parser.py b/tests/parser/file/test_epub_parser.py new file mode 100644 index 00000000..519ae2a4 --- /dev/null +++ b/tests/parser/file/test_epub_parser.py @@ -0,0 +1,152 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +import sys +import types + +from application.parser.file.epub_parser import EpubParser + + +@pytest.fixture +def epub_parser(): + return EpubParser() + + +def test_epub_init_parser(): + parser = EpubParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +def test_epub_parser_ebooklib_import_error(epub_parser): + """Test that ValueError is raised when ebooklib is not available.""" + with patch.dict(sys.modules, {"ebooklib": None}): + with pytest.raises(ValueError, match="`EbookLib` is required to read Epub files"): + epub_parser.parse_file(Path("test.epub")) + + +def test_epub_parser_html2text_import_error(epub_parser): + """Test that ValueError is raised when html2text is not available.""" + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_ebooklib.epub = fake_epub + + with patch.dict(sys.modules, {"ebooklib": fake_ebooklib, "ebooklib.epub": fake_epub}): + with patch.dict(sys.modules, {"html2text": None}): + with pytest.raises(ValueError, match="`html2text` is required to parse Epub files"): + epub_parser.parse_file(Path("test.epub")) + + +def test_epub_parser_successful_parsing(epub_parser): + """Test successful parsing of an epub file.""" + + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + # Mock ebooklib constants + fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + mock_item1 = MagicMock() + mock_item1.get_type.return_value = "document" + mock_item1.get_content.return_value = b"
Content 1
" + + mock_item2 = MagicMock() + mock_item2.get_type.return_value = "document" + mock_item2.get_content.return_value = b"Content 2
" + + mock_item3 = MagicMock() + mock_item3.get_type.return_value = "other" # Should be ignored + mock_item3.get_content.return_value = b"Other content
" + + mock_book = MagicMock() + mock_book.get_items.return_value = [mock_item1, mock_item2, mock_item3] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + + def mock_html2text_func(html_content): + if "Chapter 1" in html_content: + return "# Chapter 1\n\nContent 1\n" + elif "Chapter 2" in html_content: + return "# Chapter 2\n\nContent 2\n" + return "Other content\n" + + fake_html2text.html2text = mock_html2text_func + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("test.epub")) + + expected_result = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n" + assert result == expected_result + + # Verify epub.read_epub was called with correct parameters + fake_epub.read_epub.assert_called_once_with(Path("test.epub"), options={"ignore_ncx": True}) + + +def test_epub_parser_empty_book(epub_parser): + """Test parsing an epub file with no document items.""" + # Create mock modules + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + # Create mock book with no document items + mock_book = MagicMock() + mock_book.get_items.return_value = [] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + fake_html2text.html2text = MagicMock() + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("empty.epub")) + assert result == "" + + fake_html2text.html2text.assert_not_called() + + +def test_epub_parser_non_document_items_ignored(epub_parser): + """Test that non-document items are ignored during parsing.""" + fake_ebooklib = types.ModuleType("ebooklib") + fake_epub = types.ModuleType("ebooklib.epub") + fake_html2text = types.ModuleType("html2text") + + 
fake_ebooklib.ITEM_DOCUMENT = "document" + fake_ebooklib.epub = fake_epub + + mock_doc_item = MagicMock() + mock_doc_item.get_type.return_value = "document" + mock_doc_item.get_content.return_value = b"Document content
" + + mock_other_item = MagicMock() + mock_other_item.get_type.return_value = "image" # Not a document + + mock_book = MagicMock() + mock_book.get_items.return_value = [mock_other_item, mock_doc_item] + + fake_epub.read_epub = MagicMock(return_value=mock_book) + fake_html2text.html2text = MagicMock(return_value="Document content\n") + + with patch.dict(sys.modules, { + "ebooklib": fake_ebooklib, + "ebooklib.epub": fake_epub, + "html2text": fake_html2text + }): + result = epub_parser.parse_file(Path("test.epub")) + + assert result == "Document content\n" + + fake_html2text.html2text.assert_called_once_with("Document content
") diff --git a/tests/parser/file/test_image_parser.py b/tests/parser/file/test_image_parser.py new file mode 100644 index 00000000..c5df48b4 --- /dev/null +++ b/tests/parser/file/test_image_parser.py @@ -0,0 +1,42 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock, mock_open + +from application.parser.file.image_parser import ImageParser + + +def test_image_init_parser(): + parser = ImageParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +@patch("application.parser.file.image_parser.settings") +def test_image_parser_remote_true(mock_settings): + mock_settings.PARSE_IMAGE_REMOTE = True + parser = ImageParser() + + mock_response = MagicMock() + mock_response.json.return_value = {"markdown": "# From Image"} + + with patch("application.parser.file.image_parser.requests.post", return_value=mock_response) as mock_post: + with patch("builtins.open", mock_open()): + result = parser.parse_file(Path("img.png")) + + assert result == "# From Image" + mock_post.assert_called_once() + + +@patch("application.parser.file.image_parser.settings") +def test_image_parser_remote_false(mock_settings): + mock_settings.PARSE_IMAGE_REMOTE = False + parser = ImageParser() + + with patch("application.parser.file.image_parser.requests.post") as mock_post: + result = parser.parse_file(Path("img.png")) + + assert result == "" + mock_post.assert_not_called() + diff --git a/tests/parser/file/test_rst_parser.py b/tests/parser/file/test_rst_parser.py new file mode 100644 index 00000000..b7466234 --- /dev/null +++ b/tests/parser/file/test_rst_parser.py @@ -0,0 +1,284 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, mock_open + +from application.parser.file.rst_parser import RstParser + + +@pytest.fixture +def rst_parser(): + return RstParser() + + +@pytest.fixture +def rst_parser_custom(): + return RstParser( + 
remove_hyperlinks=False, + remove_images=False, + remove_table_excess=False, + remove_interpreters=False, + remove_directives=False, + remove_whitespaces_excess=False, + remove_characters_excess=False + ) + + +def test_rst_init_parser(): + parser = RstParser() + assert isinstance(parser._init_parser(), dict) + assert not parser.parser_config_set + parser.init_parser() + assert parser.parser_config_set + + +def test_rst_parser_initialization_with_custom_options(): + """Test RstParser initialization with custom options.""" + parser = RstParser( + remove_hyperlinks=False, + remove_images=False, + remove_table_excess=False, + remove_interpreters=False, + remove_directives=False, + remove_whitespaces_excess=False, + remove_characters_excess=False + ) + + assert not parser._remove_hyperlinks + assert not parser._remove_images + assert not parser._remove_table_excess + assert not parser._remove_interpreters + assert not parser._remove_directives + assert not parser._remove_whitespaces_excess + assert not parser._remove_characters_excess + + +def test_rst_parser_default_initialization(): + """Test RstParser initialization with default options.""" + parser = RstParser() + + assert parser._remove_hyperlinks + assert parser._remove_images + assert parser._remove_table_excess + assert parser._remove_interpreters + assert parser._remove_directives + assert parser._remove_whitespaces_excess + assert parser._remove_characters_excess + + +def test_remove_hyperlinks(): + """Test hyperlink removal functionality.""" + parser = RstParser() + content = "This is a `link text