"""Tests for ``CrawlerLoader``.

Covers: same-domain link crawling, string/list input normalization with
scheme insertion, the ``limit`` cap, and error logging when an underlying
loader raises.
"""

from unittest.mock import MagicMock, patch

from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument


class DummyResponse:
    """Minimal stand-in for a ``requests.Response``.

    Holds only the ``text`` body and a no-op ``raise_for_status`` — every
    simulated response is a success.
    """

    def __init__(self, text: str) -> None:
        self.text = text

    def raise_for_status(self) -> None:
        # Always a 2xx in these tests; never raises.
        return None


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_crawls_same_domain_links(mock_requests_get):
    """Root page links are followed only within the same domain."""
    # NOTE(review): the original HTML fixture was garbled in transit (tags
    # stripped, only the anchor texts "About" / "External" survived). The
    # anchors below are reconstructed so the root page carries one
    # same-domain link (crawled) and one external link (skipped) — confirm
    # against the crawler's link-extraction logic.
    responses = {
        "http://example.com": DummyResponse(
            """
            <html><body>
                <a href="http://example.com/about">About</a>
                <a href="http://external.com/page">External</a>
            </body></html>
            """
        ),
        "http://example.com/about": DummyResponse("About page"),
    }

    def response_side_effect(url: str):
        # Fail loudly if the crawler fetches anything unexpected
        # (e.g. the external domain).
        if url not in responses:
            raise AssertionError(f"Unexpected request for URL: {url}")
        return responses[url]

    mock_requests_get.side_effect = response_side_effect

    root_doc = MagicMock(spec=LCDocument)
    root_doc.page_content = "Root content"
    root_doc.metadata = {"source": "http://example.com"}

    about_doc = MagicMock(spec=LCDocument)
    about_doc.page_content = "About content"
    about_doc.metadata = {"source": "http://example.com/about"}

    loader_instances = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loader_instances["http://example.com"].load.return_value = [root_doc]
    loader_instances["http://example.com/about"].load.return_value = [about_doc]

    loader_call_order = []

    def loader_factory(url_list):
        # The crawler hands the loader a single-element URL list.
        url = url_list[0]
        loader_call_order.append(url)
        return loader_instances[url]

    crawler = CrawlerLoader(limit=5)
    crawler.loader = MagicMock(side_effect=loader_factory)

    result = crawler.load_data("http://example.com")

    assert len(result) == 2
    assert all(isinstance(doc, Document) for doc in result)

    sources = {doc.extra_info.get("source") for doc in result}
    assert sources == {"http://example.com", "http://example.com/about"}

    texts = {doc.text for doc in result}
    assert texts == {"Root content", "About content"}

    # Only the root and the same-domain /about page are fetched; the
    # external link is never requested.
    assert mock_requests_get.call_count == 2
    assert loader_call_order == [
        "http://example.com",
        "http://example.com/about",
    ]


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get):
    """A list input uses only its first URL; a missing scheme gets http://."""
    mock_requests_get.return_value = DummyResponse("No links here")

    doc = MagicMock(spec=LCDocument)
    doc.page_content = "Homepage"
    doc.metadata = {"source": "http://example.com"}

    loader_instance = MagicMock()
    loader_instance.load.return_value = [doc]

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=loader_instance)

    # Second list entry is deliberately ignored by the crawler.
    result = crawler.load_data(["example.com", "unused.com"])

    mock_requests_get.assert_called_once_with("http://example.com")
    crawler.loader.assert_called_once_with(["http://example.com"])
    assert len(result) == 1
    assert result[0].text == "Homepage"
    assert result[0].extra_info == {"source": "http://example.com"}


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_respects_limit(mock_requests_get):
    """With limit=1 only the root page is fetched, even if it has links."""
    # NOTE(review): fixture HTML reconstructed (see same-domain test) — the
    # root page advertises a same-domain link that the limit must prevent
    # from being crawled.
    responses = {
        "http://example.com": DummyResponse(
            """
            <html><body>
                <a href="http://example.com/about">About</a>
            </body></html>
            """
        ),
        "http://example.com/about": DummyResponse("About"),
    }
    mock_requests_get.side_effect = lambda url: responses[url]

    root_doc = MagicMock(spec=LCDocument)
    root_doc.page_content = "Root content"
    root_doc.metadata = {"source": "http://example.com"}

    about_doc = MagicMock(spec=LCDocument)
    about_doc.page_content = "About content"
    about_doc.metadata = {"source": "http://example.com/about"}

    loader_instances = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loader_instances["http://example.com"].load.return_value = [root_doc]
    loader_instances["http://example.com/about"].load.return_value = [about_doc]

    crawler = CrawlerLoader(limit=1)
    crawler.loader = MagicMock(
        side_effect=lambda url_list: loader_instances[url_list[0]]
    )

    result = crawler.load_data("http://example.com")

    assert len(result) == 1
    assert result[0].text == "Root content"
    # The /about link was discovered but never fetched or loaded.
    assert mock_requests_get.call_count == 1
    assert crawler.loader.call_count == 1


@patch("application.parser.remote.crawler_loader.logging")
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_logs_and_skips_on_loader_error(mock_requests_get, mock_logging):
    """A loader exception is logged with exc_info and the URL is skipped."""
    mock_requests_get.return_value = DummyResponse("Error route")

    failing_loader_instance = MagicMock()
    failing_loader_instance.load.side_effect = Exception("load failure")

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=failing_loader_instance)

    result = crawler.load_data("http://example.com")

    # The failure is swallowed: no documents, no raised exception.
    assert result == []
    mock_requests_get.assert_called_once_with("http://example.com")
    failing_loader_instance.load.assert_called_once()

    mock_logging.error.assert_called_once()
    message, = mock_logging.error.call_args.args
    assert "Error processing URL http://example.com" in message
    assert mock_logging.error.call_args.kwargs.get("exc_info") is True