mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 00:23:17 +00:00
(test:files) html, md, json
This commit is contained in:
44
tests/parser/file/test_html_parser.py
Normal file
44
tests/parser/file/test_html_parser.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import sys
|
||||
import types
|
||||
|
||||
from application.parser.file.html_parser import HTMLParser
|
||||
|
||||
|
||||
@pytest.fixture
def html_parser():
    """Provide a newly constructed ``HTMLParser`` to any test requesting it."""
    parser_instance = HTMLParser()
    return parser_instance
|
||||
|
||||
|
||||
def test_html_init_parser():
    """Check ``_init_parser`` returns a dict and ``init_parser`` sets the flag."""
    html = HTMLParser()

    config = html._init_parser()
    assert isinstance(config, dict)

    # After calling only the private hook, the flag is still expected unset.
    assert not html.parser_config_set

    html.init_parser()
    assert html.parser_config_set
|
||||
|
||||
|
||||
def test_html_parser_parse_file():
    """Verify ``parse_file`` returns the documents loaded by ``BSHTMLLoader``.

    Fake ``langchain_community`` and ``langchain_community.document_loaders``
    modules are placed in ``sys.modules`` for the duration of the call, with
    ``BSHTMLLoader`` replaced by a mock whose ``load()`` returns one document.
    """
    parser = HTMLParser()

    mock_doc = MagicMock()
    mock_doc.page_content = "Extracted HTML content"
    mock_doc.metadata = {"source": "test.html"}

    # ``sys`` and ``types`` are already imported at module level, so no
    # function-local re-import is needed here.
    fake_lc = types.ModuleType("langchain_community")
    fake_dl = types.ModuleType("langchain_community.document_loaders")

    # Calling the loader class yields an object whose load() gives [mock_doc].
    loader_instance = MagicMock(load=MagicMock(return_value=[mock_doc]))
    bshtml_mock = MagicMock(return_value=loader_instance)
    fake_dl.BSHTMLLoader = bshtml_mock
    fake_lc.document_loaders = fake_dl

    fake_modules = {
        "langchain_community": fake_lc,
        "langchain_community.document_loaders": fake_dl,
    }
    # patch.dict restores the original sys.modules entries on exit.
    with patch.dict(sys.modules, fake_modules):
        result = parser.parse_file(Path("test.html"))

    assert result == [mock_doc]
    bshtml_mock.assert_called_once_with(Path("test.html"))
|
||||
49
tests/parser/file/test_json_parser.py
Normal file
49
tests/parser/file/test_json_parser.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from application.parser.file.json_parser import JSONParser
|
||||
|
||||
|
||||
def test_json_init_parser():
    """Check ``_init_parser`` returns a dict and ``init_parser`` sets the flag."""
    json_parser = JSONParser()

    config = json_parser._init_parser()
    assert isinstance(config, dict)

    # After calling only the private hook, the flag is still expected unset.
    assert not json_parser.parser_config_set

    json_parser.init_parser()
    assert json_parser.parser_config_set
|
||||
|
||||
|
||||
def test_json_parser_parses_dict_concat():
    """A dict payload parsed by a default ``JSONParser`` equals its ``str`` form."""
    json_parser = JSONParser()
    payload = {"a": 1}

    # Both the file handle and json.load are stubbed; no real file is read.
    fake_open = patch("builtins.open", mock_open(read_data="{}"))
    fake_load = patch("json.load", return_value=payload)
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))

    assert parsed == "{'a': 1}"
|
||||
|
||||
|
||||
def test_json_parser_parses_list_no_concat():
    """With ``_concat_rows`` set to False, the result equals the loaded list."""
    rows = [{"a": 1}, {"b": 2}]
    json_parser = JSONParser()
    json_parser._concat_rows = False

    # Both the file handle and json.load are stubbed; no real file is read.
    fake_open = patch("builtins.open", mock_open(read_data="[]"))
    fake_load = patch("json.load", return_value=rows)
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))

    assert parsed == rows
|
||||
|
||||
|
||||
def test_json_parser_row_joiner_config():
    """A custom ``row_joiner`` is placed between the stringified rows."""
    json_parser = JSONParser(row_joiner=" || ")
    rows = [{"a": 1}, {"b": 2}]

    # Both the file handle and json.load are stubbed; no real file is read.
    fake_open = patch("builtins.open", mock_open(read_data="[]"))
    fake_load = patch("json.load", return_value=rows)
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))

    assert parsed == "{'a': 1} || {'b': 2}"
|
||||
|
||||
|
||||
def test_json_parser_forwards_json_config():
    """Entries of ``json_config`` must reach ``json.load`` as keyword arguments."""

    # A named function instead of a lambda bound to a name (PEP 8); the test
    # only checks that this exact object is forwarded, never its result.
    def pf(s):
        return 1.23

    parser = JSONParser(json_config={"parse_float": pf})
    with patch("builtins.open", mock_open(read_data="[]")):
        with patch("json.load", return_value=[]) as mock_load:
            parser.parse_file(Path("t.json"))

    # Identity check: the very same callable must have been passed through.
    assert mock_load.call_args.kwargs.get("parse_float") is pf
|
||||
|
||||
63
tests/parser/file/test_markdown_parser.py
Normal file
63
tests/parser/file/test_markdown_parser.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from pathlib import Path
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
import sys, types
|
||||
if "tiktoken" not in sys.modules:
    # No tiktoken module has been imported yet: register a minimal stand-in
    # under that name so later ``import tiktoken`` statements resolve to it.
    # NOTE(review): presumably the markdown parser imports tiktoken - confirm.

    class _Enc:
        """Stand-in encoder that yields one token per character."""

        def encode(self, s: str):
            """Return the characters of *s* as a list."""
            return list(s)

    def get_encoding(_: str):
        """Return the character-level stub encoder for any encoding name."""
        return _Enc()

    fake_tt = types.ModuleType("tiktoken")
    fake_tt.get_encoding = get_encoding
    sys.modules["tiktoken"] = fake_tt
|
||||
|
||||
import tiktoken
|
||||
|
||||
from application.parser.file.markdown_parser import MarkdownParser
|
||||
|
||||
def test_markdown_init_parser():
    """Check ``_init_parser`` returns a dict and ``init_parser`` sets the flag."""
    md_parser = MarkdownParser()

    config = md_parser._init_parser()
    assert isinstance(config, dict)

    # After calling only the private hook, the flag is still expected unset.
    assert not md_parser.parser_config_set

    md_parser.init_parser()
    assert md_parser.parser_config_set
|
||||
|
||||
|
||||
def test_markdown_parse_file_basic_structure():
    """Two headed sections yield at least two chunks, in document order."""
    markdown_text = "# Title\npara1\npara2\n## Sub\ntext\n"
    md_parser = MarkdownParser()

    # The file read is stubbed so the parser sees ``markdown_text``.
    with patch("builtins.open", mock_open(read_data=markdown_text)):
        chunks = md_parser.parse_file(Path("doc.md"))

    assert isinstance(chunks, list)
    assert len(chunks) >= 2

    # First chunk: the top-level heading together with its two paragraphs.
    for expected in ("Title", "para1", "para2"):
        assert expected in chunks[0]

    # Second chunk: the sub-heading together with its body text.
    for expected in ("Sub", "text"):
        assert expected in chunks[1]
|
||||
|
||||
|
||||
def test_markdown_removes_links_and_images_in_parse():
    """Link targets and ``![[...]]`` embeds are dropped; link text is kept."""
    markdown_text = "# T\nSee [link](http://x) and ![[img.png]] here.\n"
    md_parser = MarkdownParser()

    # The file read is stubbed so the parser sees ``markdown_text``.
    with patch("builtins.open", mock_open(read_data=markdown_text)):
        chunks = md_parser.parse_file(Path("doc.md"))

    rendered = "\n".join(chunks)
    assert "(http://x)" not in rendered
    assert "![[img.png]]" not in rendered
    assert "link" in rendered
|
||||
|
||||
|
||||
def test_markdown_token_chunking_via_max_tokens():
    """With ``max_tokens=4``, a 10-character input yields several short chunks."""
    raw_text = "abcdefghij"  # 10 chars
    md_parser = MarkdownParser(max_tokens=4)

    # The file read is stubbed so the parser sees ``raw_text``.
    with patch("builtins.open", mock_open(read_data=raw_text)):
        tups = md_parser.parse_tups(Path("doc.md"))

    assert len(tups) > 1
    # Each entry is a (header, chunk) pair; every chunk must respect the cap.
    assert all(len(chunk) <= 4 for _hdr, chunk in tups)
|
||||
|
||||
Reference in New Issue
Block a user