from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
import requests
from application.parser.remote.crawler_markdown import CrawlerLoader
from application.parser.schema.base import Document
class DummyResponse:
    """Tiny stand-in for ``requests.Response``: carries a body and never errors."""

    def __init__(self, body):
        # The crawler only reads ``.text`` from the response object.
        self.text = body

    def raise_for_status(self):
        """Pretend every request succeeded (HTTP 2xx)."""
        return None
def _fake_extract(value: str) -> SimpleNamespace:
value = value.split("//")[-1]
host = value.split("/")[0]
parts = host.split(".")
if len(parts) >= 2:
domain = parts[-2]
suffix = parts[-1]
else:
domain = host
suffix = ""
return SimpleNamespace(domain=domain, suffix=suffix)
@pytest.fixture(autouse=True)
def _patch_tldextract(monkeypatch):
    """Swap the crawler module's ``tldextract.extract`` for the offline fake."""
    target = "application.parser.remote.crawler_markdown.tldextract.extract"
    monkeypatch.setattr(target, _fake_extract)
@pytest.fixture(autouse=True)
def _patch_markdownify(monkeypatch):
    """Replace ``markdownify`` with a lookup table.

    Tests register expected ``html -> markdown`` pairs on the returned dict;
    unregistered HTML falls through unchanged.
    """
    table = {}

    def _lookup(html, *_args, **_kwargs):
        # Default to the raw HTML when no canned output was registered.
        return table.get(html, html)

    monkeypatch.setattr(
        "application.parser.remote.crawler_markdown.markdownify",
        _lookup,
    )
    return table
def _setup_session(mock_get_side_effect):
session = MagicMock()
session.get.side_effect = mock_get_side_effect
return session
def test_load_data_filters_external_links(_patch_markdownify):
    """Crawler follows same-domain links and skips external hosts.

    NOTE(review): the original fixture strings had lost their HTML markup
    (no ``<a href>`` anchors — "AboutOtherWelcome" is tag-stripped residue),
    so the crawler could never discover ``/about`` and the two-document
    assertion could not pass. The anchors are restored here; ``markdownify``
    is mocked by exact-string lookup, so the fixtures only need to be
    internally consistent.
    """
    root_html = (
        "<html><body><h1>Home</h1>"
        '<a href="/about">About</a>'
        '<a href="http://other.com/page">Other</a>'
        "<p>Welcome</p></body></html>"
    )
    about_html = (
        "<html><head><title>About</title></head><body>About page</body></html>"
    )
    _patch_markdownify[root_html] = "Home Markdown"
    _patch_markdownify[about_html] = "About Markdown"
    responses = {
        "http://example.com": DummyResponse(root_html),
        "http://example.com/about": DummyResponse(about_html),
    }
    loader = CrawlerLoader(limit=5)
    # Unknown URLs raise KeyError, which would surface as a test failure —
    # so reaching http://other.com/page would be caught here too.
    loader.session = _setup_session(lambda url, timeout=10: responses[url])
    docs = loader.load_data("http://example.com")
    assert len(docs) == 2
    for doc in docs:
        assert isinstance(doc, Document)
        assert doc.extra_info["source"] in responses
    texts = {doc.text for doc in docs}
    assert texts == {"Home Markdown", "About Markdown"}
def test_load_data_allows_subdomains(_patch_markdownify):
    """With ``allow_subdomains=True`` the crawler follows subdomain links.

    NOTE(review): the original fixture HTML had lost its ``<a href>`` anchor
    to the blog subdomain, so the crawler could never reach
    ``http://blog.example.com/post``; the markup is restored here.
    """
    root_html = (
        "<html><body><h1>Home</h1>"
        '<a href="http://blog.example.com/post">Blog</a>'
        "</body></html>"
    )
    blog_html = (
        "<html><head><title>Blog</title></head><body>Blog post</body></html>"
    )
    _patch_markdownify[root_html] = "Home Markdown"
    _patch_markdownify[blog_html] = "Blog Markdown"
    responses = {
        "http://example.com": DummyResponse(root_html),
        "http://blog.example.com/post": DummyResponse(blog_html),
    }
    loader = CrawlerLoader(limit=5, allow_subdomains=True)
    loader.session = _setup_session(lambda url, timeout=10: responses[url])
    docs = loader.load_data("http://example.com")
    sources = {doc.extra_info["source"] for doc in docs}
    assert "http://blog.example.com/post" in sources
    assert len(docs) == 2
def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
    """A failing child fetch is reported (printed) but does not abort the crawl.

    NOTE(review): the original fixture HTML had lost its ``<a href>`` anchor,
    so the failing URL was never requested and the error path was never
    exercised; the markup is restored here.
    """
    root_html = (
        "<html><body><h1>Home</h1>"
        '<a href="/about">About</a>'
        "</body></html>"
    )
    _patch_markdownify[root_html] = "Home Markdown"

    def side_effect(url, timeout=10):
        # Only the root succeeds; every discovered link blows up.
        if url == "http://example.com":
            return DummyResponse(root_html)
        raise requests.exceptions.RequestException("boom")

    loader = CrawlerLoader(limit=5)
    loader.session = _setup_session(side_effect)
    # Capture print() so we can assert the error was surfaced to the user.
    mock_print = MagicMock()
    monkeypatch.setattr("builtins.print", mock_print)
    docs = loader.load_data("http://example.com")
    assert len(docs) == 1
    assert docs[0].text == "Home Markdown"
    assert mock_print.called