"""Tests for ``CrawlerLoader`` (application/parser/remote/crawler_loader.py).

Source mirrored from https://github.com/arc53/DocsGPT.git.
"""
from unittest.mock import MagicMock, patch
from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument


class DummyResponse:
    """Minimal stand-in for ``requests.Response`` used by these tests.

    Only the two members that ``CrawlerLoader`` touches are provided:
    the ``text`` payload and a no-op ``raise_for_status``.
    """

    def __init__(self, text: str) -> None:
        # The crawler reads ``.text`` to extract links from the page body.
        self.text = text

    def raise_for_status(self) -> None:
        """Pretend every HTTP request succeeded (never raises)."""
        return None


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_crawls_same_domain_links(mock_requests_get):
    """The crawler follows same-domain links and ignores external ones."""
    pages = {
        "http://example.com": DummyResponse(
            """
<html>
<body>
<a href='/about'>About</a>
<a href='https://external.com/news'>External</a>
</body>
</html>
"""
        ),
        "http://example.com/about": DummyResponse("<html><body>About page</body></html>"),
    }

    def fake_get(url: str):
        # Fail loudly if the crawler requests anything outside the fixture,
        # e.g. the external.com link that must not be followed.
        if url not in pages:
            raise AssertionError(f"Unexpected request for URL: {url}")
        return pages[url]

    mock_requests_get.side_effect = fake_get

    home_doc = MagicMock(spec=LCDocument)
    home_doc.page_content = "Root content"
    home_doc.metadata = {"source": "http://example.com"}

    child_doc = MagicMock(spec=LCDocument)
    child_doc.page_content = "About content"
    child_doc.metadata = {"source": "http://example.com/about"}

    loaders = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loaders["http://example.com"].load.return_value = [home_doc]
    loaders["http://example.com/about"].load.return_value = [child_doc]

    visited = []

    def make_loader(url_list):
        # Record the order in which the crawler instantiates per-URL loaders.
        first_url = url_list[0]
        visited.append(first_url)
        return loaders[first_url]

    crawler = CrawlerLoader(limit=5)
    crawler.loader = MagicMock(side_effect=make_loader)

    result = crawler.load_data("http://example.com")

    # Both same-domain pages were converted into schema Documents.
    assert len(result) == 2
    assert all(isinstance(doc, Document) for doc in result)

    sources = {doc.extra_info.get("source") for doc in result}
    assert sources == {"http://example.com", "http://example.com/about"}

    texts = {doc.text for doc in result}
    assert texts == {"Root content", "About content"}

    # Exactly two fetches, root first, then the discovered /about link.
    assert mock_requests_get.call_count == 2
    assert visited == ["http://example.com", "http://example.com/about"]


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get):
    """A list input uses only its first URL; a bare host gains ``http://``."""
    mock_requests_get.return_value = DummyResponse("<html><body>No links here</body></html>")

    home_doc = MagicMock(spec=LCDocument)
    home_doc.page_content = "Homepage"
    home_doc.metadata = {"source": "http://example.com"}

    stub_loader = MagicMock()
    stub_loader.load.return_value = [home_doc]

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=stub_loader)

    # Second list entry must be ignored; first entry has no scheme.
    result = crawler.load_data(["example.com", "unused.com"])

    mock_requests_get.assert_called_once_with("http://example.com")
    crawler.loader.assert_called_once_with(["http://example.com"])

    assert len(result) == 1
    assert result[0].text == "Homepage"
    assert result[0].extra_info == {"source": "http://example.com"}


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_respects_limit(mock_requests_get):
    """With ``limit=1`` only the root page is fetched and loaded."""
    pages = {
        "http://example.com": DummyResponse(
            """
<html>
<body>
<a href='/about'>About</a>
</body>
</html>
"""
        ),
        "http://example.com/about": DummyResponse("<html><body>About</body></html>"),
    }

    # KeyError on any URL outside the fixture, same as the original lambda.
    mock_requests_get.side_effect = pages.__getitem__

    home_doc = MagicMock(spec=LCDocument)
    home_doc.page_content = "Root content"
    home_doc.metadata = {"source": "http://example.com"}

    child_doc = MagicMock(spec=LCDocument)
    child_doc.page_content = "About content"
    child_doc.metadata = {"source": "http://example.com/about"}

    loaders = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loaders["http://example.com"].load.return_value = [home_doc]
    loaders["http://example.com/about"].load.return_value = [child_doc]

    crawler = CrawlerLoader(limit=1)
    crawler.loader = MagicMock(side_effect=lambda url_list: loaders[url_list[0]])

    result = crawler.load_data("http://example.com")

    # The /about link was discovered but never fetched or loaded.
    assert len(result) == 1
    assert result[0].text == "Root content"
    assert mock_requests_get.call_count == 1
    assert crawler.loader.call_count == 1


@patch("application.parser.remote.crawler_loader.logging")
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_logs_and_skips_on_loader_error(mock_requests_get, mock_logging):
    """A loader exception is logged with ``exc_info`` and the URL is skipped."""
    mock_requests_get.return_value = DummyResponse("<html><body>Error route</body></html>")

    broken_loader = MagicMock()
    broken_loader.load.side_effect = Exception("load failure")

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=broken_loader)

    result = crawler.load_data("http://example.com")

    # The failing page produces no documents, but the fetch and the load
    # attempt both happened exactly once.
    assert result == []
    mock_requests_get.assert_called_once_with("http://example.com")
    broken_loader.load.assert_called_once()

    # One error log entry mentioning the URL, with the traceback attached.
    mock_logging.error.assert_called_once()
    (message,) = mock_logging.error.call_args.args
    assert "Error processing URL http://example.com" in message
    assert mock_logging.error.call_args.kwargs.get("exc_info") is True