"""Unit tests for ``application.parser.file.rst_parser.RstParser``.

File access is replaced with ``unittest.mock.mock_open`` so that no test
touches the real filesystem.
"""

import pytest
from pathlib import Path
from unittest.mock import patch, mock_open

from application.parser.file.rst_parser import RstParser


@pytest.fixture
def rst_parser():
    """Return an ``RstParser`` built with its default options."""
    return RstParser()


@pytest.fixture
def rst_parser_custom():
    """Return an ``RstParser`` with every cleaning option switched off."""
    return RstParser(
        remove_hyperlinks=False,
        remove_images=False,
        remove_table_excess=False,
        remove_interpreters=False,
        remove_directives=False,
        remove_whitespaces_excess=False,
        remove_characters_excess=False,
    )


def test_rst_init_parser(rst_parser):
    """Test that init_parser() marks the parser configuration as set."""
    assert isinstance(rst_parser._init_parser(), dict)
    assert not rst_parser.parser_config_set
    rst_parser.init_parser()
    assert rst_parser.parser_config_set


def test_rst_parser_initialization_with_custom_options(rst_parser_custom):
    """Test RstParser initialization with custom options."""
    assert not rst_parser_custom._remove_hyperlinks
    assert not rst_parser_custom._remove_images
    assert not rst_parser_custom._remove_table_excess
    assert not rst_parser_custom._remove_interpreters
    assert not rst_parser_custom._remove_directives
    assert not rst_parser_custom._remove_whitespaces_excess
    assert not rst_parser_custom._remove_characters_excess


def test_rst_parser_default_initialization(rst_parser):
    """Test RstParser initialization with default options."""
    assert rst_parser._remove_hyperlinks
    assert rst_parser._remove_images
    assert rst_parser._remove_table_excess
    assert rst_parser._remove_interpreters
    assert rst_parser._remove_directives
    assert rst_parser._remove_whitespaces_excess
    assert rst_parser._remove_characters_excess


def test_remove_hyperlinks(rst_parser):
    """Test hyperlink removal functionality."""
    # An RST hyperlink carries its target as an embedded URI in angle
    # brackets; without it there is nothing for the parser to strip.
    content = "This is a `link text <http://example.com>`_ and more text."
    result = rst_parser.remove_hyperlinks(content)
    assert result == "This is a link text and more text."


def test_remove_images(rst_parser):
    """Test image removal functionality."""
    content = "Some text\n.. image:: path/to/image.png\nMore text"
    result = rst_parser.remove_images(content)
    assert result == "Some text\n\nMore text"


def test_remove_directives(rst_parser):
    """Test directive removal functionality."""
    content = "Text with `..note::` directive and more text"
    result = rst_parser.remove_directives(content)
    # The regex pattern looks for `..something::` so it should remove `..note::`
    assert result == "Text with ` directive and more text"


def test_remove_interpreters(rst_parser):
    """Test interpreter removal functionality."""
    content = "Text with :doc: role and :ref: another role"
    result = rst_parser.remove_interpreters(content)
    # Compare word by word: what matters is that the role markers are gone,
    # not how much whitespace is left where they used to be.
    assert ":doc:" not in result
    assert ":ref:" not in result
    assert result.split() == "Text with role and another role".split()


def test_remove_table_excess(rst_parser):
    """Test table separator removal functionality."""
    content = "Header\n+-----+-----+\n| A | B |\n+-----+-----+\nFooter"
    result = rst_parser.remove_table_excess(content)
    assert "+-----+-----+" not in result
    assert "Header" in result
    assert "| A | B |" in result
    assert "Footer" in result


def test_chunk_by_token_count(rst_parser):
    """Test token-based chunking functionality."""
    text = "This is a long text that should be chunked into smaller pieces based on token count"
    chunks = rst_parser.chunk_by_token_count(text, max_tokens=5)

    # Should create multiple chunks
    assert len(chunks) > 1

    # Each chunk should be reasonably sized (approximately 5 * 5 = 25 characters)
    for chunk in chunks:
        assert len(chunk) <= 30  # Allow some flexibility


def test_rst_to_tups_with_headers(rst_parser):
    """Test RST to tuples conversion with headers."""
    rst_content = """Introduction
============

This is the introduction text.

Chapter 1
=========

This is chapter 1 content.
More content here.

Chapter 2
=========

This is chapter 2 content."""

    tups = rst_parser.rst_to_tups(rst_content)

    # The document must be split into more than one section
    assert len(tups) >= 2

    # Check that headers are captured
    headers = [tup[0] for tup in tups if tup[0] is not None]
    assert "Introduction" in headers
    assert "Chapter 1" in headers
    assert "Chapter 2" in headers


def test_rst_to_tups_without_headers(rst_parser):
    """Test RST to tuples conversion without headers."""
    rst_content = "Just plain text without any headers or structure."

    tups = rst_parser.rst_to_tups(rst_content)

    # Should have one tuple with None header
    assert len(tups) == 1
    assert tups[0][0] is None
    assert "Just plain text" in tups[0][1]


def test_parse_file_basic(rst_parser):
    """Test basic parse_file functionality."""
    content = """Title
=====

This is some content.

Subtitle
--------

More content here."""

    with patch("builtins.open", mock_open(read_data=content)):
        result = rst_parser.parse_file(Path("test.rst"))

    # Should return a list of strings
    assert isinstance(result, list)
    assert len(result) >= 1

    # Content should be processed and cleaned
    joined_result = "\n".join(result)
    assert "Title" in joined_result
    assert "content" in joined_result


def test_parse_file_with_hyperlinks(rst_parser_custom):
    """Test parse_file with hyperlinks when removal is disabled."""
    content = "Text with `link <http://example.com>`_ here."

    with patch("builtins.open", mock_open(read_data=content)):
        result = rst_parser_custom.parse_file(Path("test.rst"))

    joined_result = "\n".join(result)
    # Hyperlinks should be preserved when removal is disabled
    assert "http://example.com" in joined_result


def test_parse_tups_with_max_tokens(rst_parser):
    """Test parse_tups with token chunking."""
    content = """Header
======

This is a very long piece of content that should be chunked into smaller pieces when max_tokens is specified.
It contains multiple sentences and should be split appropriately."""

    with patch("builtins.open", mock_open(read_data=content)):
        tups = rst_parser.parse_tups(Path("test.rst"), max_tokens=10)

    # Should create multiple chunks due to token limit
    assert len(tups) > 1

    # Each tuple should have a header indicating chunk number
    chunk_headers = [tup[0] for tup in tups]
    assert any("Chunk" in str(header) for header in chunk_headers if header)


def test_parse_tups_without_max_tokens(rst_parser):
    """Test parse_tups without token chunking."""
    content = """Header
======

Content here."""

    with patch("builtins.open", mock_open(read_data=content)):
        tups = rst_parser.parse_tups(Path("test.rst"), max_tokens=None)

    # Should not create additional chunks
    assert len(tups) >= 1

    # Headers should not contain "Chunk"
    chunk_headers = [tup[0] for tup in tups]
    assert not any("Chunk" in str(header) for header in chunk_headers if header)


def test_parse_file_empty_content(rst_parser):
    """Test parse_file with empty content."""
    with patch("builtins.open", mock_open(read_data="")):
        result = rst_parser.parse_file(Path("empty.rst"))

    # Should handle empty content gracefully
    assert isinstance(result, list)


def test_all_cleaning_methods_applied(rst_parser):
    """Test that all cleaning methods are applied when enabled."""
    content = """Title
=====

Text with `link <http://example.com>`_ and :doc:`reference`.

.. image:: image.png

+-----+-----+
| A | B |
+-----+-----+

`..note::` This is a note."""

    with patch("builtins.open", mock_open(read_data=content)):
        result = rst_parser.parse_file(Path("test.rst"))

    joined_result = "\n".join(result)

    # All unwanted elements should be removed
    assert "http://example.com" not in joined_result  # hyperlinks removed
    assert ":doc:" not in joined_result  # interpreters removed
    assert ".. image::" not in joined_result  # images removed
    assert "+-----+" not in joined_result  # table excess removed
    # The directive pattern looks for `..something::` so regular .. note:: won't be removed
    # but `..note::` will be removed
    assert "`..note::`" not in joined_result  # directives removed