(test:files) html, md, json

This commit is contained in:
ManishMadan2882
2025-09-29 17:23:15 +05:30
parent ba496a772b
commit 282bd35f52
3 changed files with 156 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
import sys
import types
from application.parser.file.html_parser import HTMLParser
@pytest.fixture
def html_parser():
    """Provide a freshly constructed ``HTMLParser`` to tests that request it."""
    parser = HTMLParser()
    return parser
def test_html_init_parser():
    """Check the config type and the ``parser_config_set`` transition.

    ``_init_parser()`` must return a dict, ``parser_config_set`` must be falsy
    until ``init_parser()`` is called, and truthy afterwards.
    """
    html = HTMLParser()

    config = html._init_parser()
    assert isinstance(config, dict)
    # Calling the private hook directly must not flag the parser as configured.
    assert not html.parser_config_set

    html.init_parser()
    assert html.parser_config_set
def test_html_parser_parse_file():
    """Check that ``parse_file`` returns the documents the loader produces.

    Stand-in ``langchain_community`` modules are placed in ``sys.modules``
    while ``parse_file`` runs; the test expects the parser to build
    ``BSHTMLLoader`` from those fakes with the file path and to return the
    result of its ``load()`` call.
    """
    parser = HTMLParser()

    mock_doc = MagicMock()
    mock_doc.page_content = "Extracted HTML content"
    mock_doc.metadata = {"source": "test.html"}

    # ``sys`` and ``types`` come from the module-level imports; no local
    # re-import is needed here.
    fake_lc = types.ModuleType("langchain_community")
    fake_dl = types.ModuleType("langchain_community.document_loaders")

    # Calling the fake loader class yields an object whose load() returns
    # the single mocked document.
    loader_instance = MagicMock(load=MagicMock(return_value=[mock_doc]))
    bshtml_mock = MagicMock(return_value=loader_instance)
    fake_dl.BSHTMLLoader = bshtml_mock
    fake_lc.document_loaders = fake_dl

    fake_modules = {
        "langchain_community": fake_lc,
        "langchain_community.document_loaders": fake_dl,
    }
    # patch.dict restores sys.modules when the block exits.
    with patch.dict(sys.modules, fake_modules):
        result = parser.parse_file(Path("test.html"))

    assert result == [mock_doc]
    bshtml_mock.assert_called_once_with(Path("test.html"))

View File

@@ -0,0 +1,49 @@
import pytest
from pathlib import Path
from unittest.mock import patch, mock_open
from application.parser.file.json_parser import JSONParser
def test_json_init_parser():
    """Check the config type and the ``parser_config_set`` transition.

    ``_init_parser()`` must return a dict, ``parser_config_set`` must be falsy
    until ``init_parser()`` is called, and truthy afterwards.
    """
    json_parser = JSONParser()

    config = json_parser._init_parser()
    assert isinstance(config, dict)
    # Calling the private hook directly must not flag the parser as configured.
    assert not json_parser.parser_config_set

    json_parser.init_parser()
    assert json_parser.parser_config_set
def test_json_parser_parses_dict_concat():
    """A dict payload is expected to come back as its string form."""
    json_parser = JSONParser()
    # Both the file handle and the decoder are faked, so no file is read.
    fake_open = patch("builtins.open", mock_open(read_data="{}"))
    fake_load = patch("json.load", return_value={"a": 1})
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))
    assert parsed == "{'a': 1}"
def test_json_parser_parses_list_no_concat():
    """With ``_concat_rows`` off, the result must equal the loaded list."""
    rows = [{"a": 1}, {"b": 2}]
    json_parser = JSONParser()
    json_parser._concat_rows = False
    # Both the file handle and the decoder are faked, so no file is read.
    fake_open = patch("builtins.open", mock_open(read_data="[]"))
    fake_load = patch("json.load", return_value=rows)
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))
    assert parsed == rows
def test_json_parser_row_joiner_config():
    """A custom ``row_joiner`` is expected to separate the stringified rows."""
    json_parser = JSONParser(row_joiner=" || ")
    # Both the file handle and the decoder are faked, so no file is read.
    fake_open = patch("builtins.open", mock_open(read_data="[]"))
    fake_load = patch("json.load", return_value=[{"a": 1}, {"b": 2}])
    with fake_open, fake_load:
        parsed = json_parser.parse_file(Path("t.json"))
    assert parsed == "{'a': 1} || {'b': 2}"
def test_json_parser_forwards_json_config():
    """Check that ``json_config`` entries reach ``json.load`` as keyword arguments."""

    def parse_float_hook(_value):
        # Never invoked (json.load is mocked); only its identity is checked.
        return 1.23

    parser = JSONParser(json_config={"parse_float": parse_float_hook})
    with patch("builtins.open", mock_open(read_data="[]")), patch(
        "json.load", return_value=[]
    ) as mock_load:
        parser.parse_file(Path("t.json"))

    # The exact callable object must be passed through, not a copy or wrapper.
    assert mock_load.call_args.kwargs.get("parse_float") is parse_float_hook

View File

@@ -0,0 +1,63 @@
from pathlib import Path
from unittest.mock import mock_open, patch
import sys, types
# Register a lightweight stand-in for ``tiktoken`` unless a module of that
# name has already been imported in this process.
if "tiktoken" not in sys.modules:

    class _Enc:
        """Encoder stub that yields one token per character."""

        def encode(self, s: str):
            return [ch for ch in s]

    def get_encoding(_: str):
        """Return the character-level stub, ignoring the requested name."""
        return _Enc()

    fake_tt = types.ModuleType("tiktoken")
    fake_tt.get_encoding = get_encoding
    sys.modules[fake_tt.__name__] = fake_tt
import tiktoken
from application.parser.file.markdown_parser import MarkdownParser
def test_markdown_init_parser():
    """Check the config type and the ``parser_config_set`` transition.

    ``_init_parser()`` must return a dict, ``parser_config_set`` must be falsy
    until ``init_parser()`` is called, and truthy afterwards.
    """
    md_parser = MarkdownParser()

    config = md_parser._init_parser()
    assert isinstance(config, dict)
    # Calling the private hook directly must not flag the parser as configured.
    assert not md_parser.parser_config_set

    md_parser.init_parser()
    assert md_parser.parser_config_set
def test_markdown_parse_file_basic_structure():
    """Two headed sections are expected to yield at least two ordered entries.

    The first entry must contain the top-level title and its two paragraphs;
    the second must contain the sub-heading and its text.
    """
    markdown = "# Title\npara1\npara2\n## Sub\ntext\n"
    md_parser = MarkdownParser()
    with patch("builtins.open", mock_open(read_data=markdown)):
        sections = md_parser.parse_file(Path("doc.md"))

    assert isinstance(sections, list)
    assert len(sections) >= 2

    first, second = sections[0], sections[1]
    for fragment in ("Title", "para1", "para2"):
        assert fragment in first
    for fragment in ("Sub", "text"):
        assert fragment in second
def test_markdown_removes_links_and_images_in_parse():
    """Link targets and ``![[...]]`` embeds must be gone; link text must remain."""
    markdown = "# T\nSee [link](http://x) and ![[img.png]] here.\n"
    md_parser = MarkdownParser()
    with patch("builtins.open", mock_open(read_data=markdown)):
        sections = md_parser.parse_file(Path("doc.md"))

    flattened = "\n".join(sections)
    for stripped in ("(http://x)", "![[img.png]]"):
        assert stripped not in flattened
    # The visible label of the link is expected to survive the clean-up.
    assert "link" in flattened
def test_markdown_token_chunking_via_max_tokens():
    """Text longer than ``max_tokens`` must be split into several small chunks.

    NOTE(review): the ``len(chunk) <= 4`` check measures characters, so it
    presumably relies on the one-token-per-character ``tiktoken`` stub that
    this module installs when ``tiktoken`` is not already imported. Confirm
    the behaviour when the real tokenizer is loaded.
    """
    ten_chars = "abcdefghij"  # 10 chars
    md_parser = MarkdownParser(max_tokens=4)
    with patch("builtins.open", mock_open(read_data=ten_chars)):
        pairs = md_parser.parse_tups(Path("doc.md"))

    assert len(pairs) > 1
    # Each entry unpacks into two values; only the chunk length is checked.
    assert all(len(chunk) <= 4 for _hdr, chunk in pairs)