mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
(test:files) epub, image, rst
This commit is contained in:
284
tests/parser/file/test_rst_parser.py
Normal file
284
tests/parser/file/test_rst_parser.py
Normal file
@@ -0,0 +1,284 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def rst_parser():
|
||||
return RstParser()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def rst_parser_custom():
|
||||
return RstParser(
|
||||
remove_hyperlinks=False,
|
||||
remove_images=False,
|
||||
remove_table_excess=False,
|
||||
remove_interpreters=False,
|
||||
remove_directives=False,
|
||||
remove_whitespaces_excess=False,
|
||||
remove_characters_excess=False
|
||||
)
|
||||
|
||||
|
||||
def test_rst_init_parser():
|
||||
parser = RstParser()
|
||||
assert isinstance(parser._init_parser(), dict)
|
||||
assert not parser.parser_config_set
|
||||
parser.init_parser()
|
||||
assert parser.parser_config_set
|
||||
|
||||
|
||||
def test_rst_parser_initialization_with_custom_options():
|
||||
"""Test RstParser initialization with custom options."""
|
||||
parser = RstParser(
|
||||
remove_hyperlinks=False,
|
||||
remove_images=False,
|
||||
remove_table_excess=False,
|
||||
remove_interpreters=False,
|
||||
remove_directives=False,
|
||||
remove_whitespaces_excess=False,
|
||||
remove_characters_excess=False
|
||||
)
|
||||
|
||||
assert not parser._remove_hyperlinks
|
||||
assert not parser._remove_images
|
||||
assert not parser._remove_table_excess
|
||||
assert not parser._remove_interpreters
|
||||
assert not parser._remove_directives
|
||||
assert not parser._remove_whitespaces_excess
|
||||
assert not parser._remove_characters_excess
|
||||
|
||||
|
||||
def test_rst_parser_default_initialization():
|
||||
"""Test RstParser initialization with default options."""
|
||||
parser = RstParser()
|
||||
|
||||
assert parser._remove_hyperlinks
|
||||
assert parser._remove_images
|
||||
assert parser._remove_table_excess
|
||||
assert parser._remove_interpreters
|
||||
assert parser._remove_directives
|
||||
assert parser._remove_whitespaces_excess
|
||||
assert parser._remove_characters_excess
|
||||
|
||||
|
||||
def test_remove_hyperlinks():
|
||||
"""Test hyperlink removal functionality."""
|
||||
parser = RstParser()
|
||||
content = "This is a `link text <http://example.com>`_ and more text."
|
||||
result = parser.remove_hyperlinks(content)
|
||||
assert result == "This is a link text and more text."
|
||||
|
||||
|
||||
def test_remove_images():
|
||||
"""Test image removal functionality."""
|
||||
parser = RstParser()
|
||||
content = "Some text\n.. image:: path/to/image.png\nMore text"
|
||||
result = parser.remove_images(content)
|
||||
assert result == "Some text\n\nMore text"
|
||||
|
||||
|
||||
def test_remove_directives():
|
||||
"""Test directive removal functionality."""
|
||||
parser = RstParser()
|
||||
content = "Text with `..note::` directive and more text"
|
||||
result = parser.remove_directives(content)
|
||||
# The regex pattern looks for `..something::` so it should remove `..note::`
|
||||
assert result == "Text with ` directive and more text"
|
||||
|
||||
|
||||
def test_remove_interpreters():
|
||||
"""Test interpreter removal functionality."""
|
||||
parser = RstParser()
|
||||
content = "Text with :doc: role and :ref: another role"
|
||||
result = parser.remove_interpreters(content)
|
||||
assert result == "Text with role and another role"
|
||||
|
||||
|
||||
def test_remove_table_excess():
|
||||
"""Test table separator removal functionality."""
|
||||
parser = RstParser()
|
||||
content = "Header\n+-----+-----+\n| A | B |\n+-----+-----+\nFooter"
|
||||
result = parser.remove_table_excess(content)
|
||||
assert "+-----+-----+" not in result
|
||||
assert "Header" in result
|
||||
assert "| A | B |" in result
|
||||
assert "Footer" in result
|
||||
|
||||
|
||||
def test_chunk_by_token_count():
|
||||
"""Test token-based chunking functionality."""
|
||||
parser = RstParser()
|
||||
text = "This is a long text that should be chunked into smaller pieces based on token count"
|
||||
chunks = parser.chunk_by_token_count(text, max_tokens=5)
|
||||
|
||||
# Should create multiple chunks
|
||||
assert len(chunks) > 1
|
||||
|
||||
# Each chunk should be reasonably sized (approximately 5 * 5 = 25 characters)
|
||||
for chunk in chunks:
|
||||
assert len(chunk) <= 30 # Allow some flexibility
|
||||
|
||||
|
||||
def test_rst_to_tups_with_headers():
|
||||
"""Test RST to tuples conversion with headers."""
|
||||
parser = RstParser()
|
||||
rst_content = """Introduction
|
||||
============
|
||||
|
||||
This is the introduction text.
|
||||
|
||||
Chapter 1
|
||||
=========
|
||||
|
||||
This is chapter 1 content.
|
||||
More content here.
|
||||
|
||||
Chapter 2
|
||||
=========
|
||||
|
||||
This is chapter 2 content."""
|
||||
|
||||
tups = parser.rst_to_tups(rst_content)
|
||||
|
||||
# Should have 3 tuples (intro, chapter 1, chapter 2)
|
||||
assert len(tups) >= 2
|
||||
|
||||
# Check that headers are captured
|
||||
headers = [tup[0] for tup in tups if tup[0] is not None]
|
||||
assert "Introduction" in headers
|
||||
assert "Chapter 1" in headers
|
||||
assert "Chapter 2" in headers
|
||||
|
||||
|
||||
def test_rst_to_tups_without_headers():
|
||||
"""Test RST to tuples conversion without headers."""
|
||||
parser = RstParser()
|
||||
rst_content = "Just plain text without any headers or structure."
|
||||
|
||||
tups = parser.rst_to_tups(rst_content)
|
||||
|
||||
# Should have one tuple with None header
|
||||
assert len(tups) == 1
|
||||
assert tups[0][0] is None
|
||||
assert "Just plain text" in tups[0][1]
|
||||
|
||||
|
||||
def test_parse_file_basic(rst_parser):
|
||||
"""Test basic parse_file functionality."""
|
||||
content = """Title
|
||||
=====
|
||||
|
||||
This is some content.
|
||||
|
||||
Subtitle
|
||||
--------
|
||||
|
||||
More content here."""
|
||||
|
||||
with patch("builtins.open", mock_open(read_data=content)):
|
||||
result = rst_parser.parse_file(Path("test.rst"))
|
||||
|
||||
# Should return a list of strings
|
||||
assert isinstance(result, list)
|
||||
assert len(result) >= 1
|
||||
|
||||
# Content should be processed and cleaned
|
||||
joined_result = "\n".join(result)
|
||||
assert "Title" in joined_result
|
||||
assert "content" in joined_result
|
||||
|
||||
|
||||
def test_parse_file_with_hyperlinks(rst_parser_custom):
|
||||
"""Test parse_file with hyperlinks when removal is disabled."""
|
||||
content = "Text with `link <http://example.com>`_ here."
|
||||
|
||||
with patch("builtins.open", mock_open(read_data=content)):
|
||||
result = rst_parser_custom.parse_file(Path("test.rst"))
|
||||
|
||||
joined_result = "\n".join(result)
|
||||
# Hyperlinks should be preserved when removal is disabled
|
||||
assert "http://example.com" in joined_result
|
||||
|
||||
|
||||
def test_parse_tups_with_max_tokens():
|
||||
"""Test parse_tups with token chunking."""
|
||||
parser = RstParser()
|
||||
content = """Header
|
||||
======
|
||||
|
||||
This is a very long piece of content that should be chunked into smaller pieces when max_tokens is specified. It contains multiple sentences and should be split appropriately."""
|
||||
|
||||
with patch("builtins.open", mock_open(read_data=content)):
|
||||
tups = parser.parse_tups(Path("test.rst"), max_tokens=10)
|
||||
|
||||
# Should create multiple chunks due to token limit
|
||||
assert len(tups) > 1
|
||||
|
||||
# Each tuple should have a header indicating chunk number
|
||||
chunk_headers = [tup[0] for tup in tups]
|
||||
assert any("Chunk" in str(header) for header in chunk_headers if header)
|
||||
|
||||
|
||||
def test_parse_tups_without_max_tokens():
|
||||
"""Test parse_tups without token chunking."""
|
||||
parser = RstParser()
|
||||
content = """Header
|
||||
======
|
||||
|
||||
Content here."""
|
||||
|
||||
with patch("builtins.open", mock_open(read_data=content)):
|
||||
tups = parser.parse_tups(Path("test.rst"), max_tokens=None)
|
||||
|
||||
# Should not create additional chunks
|
||||
assert len(tups) >= 1
|
||||
|
||||
# Headers should not contain "Chunk"
|
||||
chunk_headers = [tup[0] for tup in tups]
|
||||
assert not any("Chunk" in str(header) for header in chunk_headers if header)
|
||||
|
||||
|
||||
def test_parse_file_empty_content():
|
||||
"""Test parse_file with empty content."""
|
||||
parser = RstParser()
|
||||
|
||||
with patch("builtins.open", mock_open(read_data="")):
|
||||
result = parser.parse_file(Path("empty.rst"))
|
||||
|
||||
# Should handle empty content gracefully
|
||||
assert isinstance(result, list)
|
||||
|
||||
|
||||
def test_all_cleaning_methods_applied():
|
||||
"""Test that all cleaning methods are applied when enabled."""
|
||||
parser = RstParser()
|
||||
content = """Title
|
||||
=====
|
||||
|
||||
Text with `link <http://example.com>`_ and :doc:`reference`.
|
||||
|
||||
.. image:: image.png
|
||||
|
||||
+-----+-----+
|
||||
| A | B |
|
||||
+-----+-----+
|
||||
|
||||
`..note::` This is a note."""
|
||||
|
||||
with patch("builtins.open", mock_open(read_data=content)):
|
||||
result = parser.parse_file(Path("test.rst"))
|
||||
|
||||
joined_result = "\n".join(result)
|
||||
|
||||
# All unwanted elements should be removed
|
||||
assert "http://example.com" not in joined_result # hyperlinks removed
|
||||
assert ":doc:" not in joined_result # interpreters removed
|
||||
assert ".. image::" not in joined_result # images removed
|
||||
assert "+-----+" not in joined_result # table excess removed
|
||||
# The directive pattern looks for `..something::` so regular .. note:: won't be removed
|
||||
# but `..note::` will be removed
|
||||
assert "`..note::`" not in joined_result # directives removed
|
||||
Reference in New Issue
Block a user