# Mirror of https://github.com/arc53/DocsGPT.git (synced 2025-11-29 16:43:16 +00:00) - 61 lines, 1.7 KiB, Python.
from pathlib import Path
|
|
from unittest.mock import mock_open, patch
|
|
|
|
import pytest
|
|
import tiktoken
|
|
|
|
from application.parser.file.markdown_parser import MarkdownParser
|
|
|
|
|
|
class _Enc:
    """Minimal tokenizer stand-in exposing only ``encode``."""

    def encode(self, s: str):
        """Return the characters of *s* as a list, one "token" per character."""
        return [*s]
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _patch_tokenizer(monkeypatch):
    """Swap ``tiktoken.get_encoding`` for a factory that returns ``_Enc``.

    Declared ``autouse`` so the replacement is active in every test of this
    module without being requested explicitly.
    """

    def _fake_get_encoding(_):
        # The single positional argument is accepted and ignored.
        return _Enc()

    monkeypatch.setattr(tiktoken, "get_encoding", _fake_get_encoding)
|
|
|
|
def test_markdown_init_parser():
    """``_init_parser`` yields a dict; ``init_parser`` sets ``parser_config_set``."""
    md_parser = MarkdownParser()

    config = md_parser._init_parser()
    assert isinstance(config, dict)

    # The flag must still be falsy after calling only the private hook.
    assert not md_parser.parser_config_set

    md_parser.init_parser()
    assert md_parser.parser_config_set
|
|
|
|
|
|
def test_markdown_parse_file_basic_structure():
    """``parse_file`` returns a list whose first two items cover both headed sections."""
    markdown_text = "# Title\npara1\npara2\n## Sub\ntext\n"
    md_parser = MarkdownParser()

    # ``open`` is mocked so no real file named doc.md is needed.
    with patch("builtins.open", mock_open(read_data=markdown_text)):
        sections = md_parser.parse_file(Path("doc.md"))

    assert isinstance(sections, list)
    assert len(sections) >= 2

    first_section = sections[0]
    assert "Title" in first_section
    assert "para1" in first_section
    assert "para2" in first_section

    second_section = sections[1]
    assert "Sub" in second_section
    assert "text" in second_section
|
|
|
|
|
|
def test_markdown_removes_links_and_images_in_parse():
    """Link targets and ``![[...]]`` embeds are absent from the output; link text stays."""
    markdown_text = "# T\nSee [link](http://x) and ![[img.png]] here.\n"
    md_parser = MarkdownParser()

    # ``open`` is mocked so no real file named doc.md is needed.
    with patch("builtins.open", mock_open(read_data=markdown_text)):
        sections = md_parser.parse_file(Path("doc.md"))

    # Check against the whole output at once rather than per section.
    combined = "\n".join(sections)

    assert "(http://x)" not in combined
    assert "![[img.png]]" not in combined
    assert "link" in combined
|
|
|
|
|
|
def test_markdown_token_chunking_via_max_tokens():
    """With ``max_tokens=4``, ``parse_tups`` returns several chunks of at most 4 chars."""
    source_text = "abcdefghij"  # 10 chars
    md_parser = MarkdownParser(max_tokens=4)

    # ``open`` is mocked so no real file named doc.md is needed.
    with patch("builtins.open", mock_open(read_data=source_text)):
        pairs = md_parser.parse_tups(Path("doc.md"))

    assert len(pairs) > 1

    # Collect offenders so a failure shows exactly which chunks are too long.
    oversized = [body for _header, body in pairs if len(body) > 4]
    assert not oversized
|
|
|