# tests/parser/remote/test_crawler_loader.py
#
# Unit tests for CrawlerLoader: same-domain crawling, scheme normalization,
# the page limit, and error logging. The crawler's HTTP layer
# (requests.get) and its inner document loader are both mocked.
#
# NOTE(review): this file was recovered from a whitespace-mangled patch in
# which the HTML fixture literals lost their markup; the anchor tags below
# are reconstructed to match the assertions (same-domain link followed,
# external link skipped) — confirm against the original patch if available.
from unittest.mock import MagicMock, patch

from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument


class DummyResponse:
    """Minimal stand-in for requests.Response: just .text and a no-op raise_for_status."""

    def __init__(self, text: str) -> None:
        self.text = text

    def raise_for_status(self) -> None:
        return None


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_crawls_same_domain_links(mock_requests_get):
    """The crawler follows same-domain links and ignores external ones."""
    responses = {
        "http://example.com": DummyResponse(
            """
            <html>
              <body>
                <a href="http://example.com/about">About</a>
                <a href="http://external.com">External</a>
              </body>
            </html>
            """
        ),
        "http://example.com/about": DummyResponse(
            "<html><body>About page</body></html>"
        ),
    }

    def response_side_effect(url: str):
        # Fail loudly if the crawler requests anything outside the fixture set
        # (e.g. the external domain, which must be filtered out).
        if url not in responses:
            raise AssertionError(f"Unexpected request for URL: {url}")
        return responses[url]

    mock_requests_get.side_effect = response_side_effect

    root_doc = MagicMock(spec=LCDocument)
    root_doc.page_content = "Root content"
    root_doc.metadata = {"source": "http://example.com"}

    about_doc = MagicMock(spec=LCDocument)
    about_doc.page_content = "About content"
    about_doc.metadata = {"source": "http://example.com/about"}

    loader_instances = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loader_instances["http://example.com"].load.return_value = [root_doc]
    loader_instances["http://example.com/about"].load.return_value = [about_doc]

    loader_call_order = []

    def loader_factory(url_list):
        # CrawlerLoader instantiates its loader with a one-element URL list.
        url = url_list[0]
        loader_call_order.append(url)
        return loader_instances[url]

    crawler = CrawlerLoader(limit=5)
    crawler.loader = MagicMock(side_effect=loader_factory)

    result = crawler.load_data("http://example.com")

    assert len(result) == 2
    assert all(isinstance(doc, Document) for doc in result)

    sources = {doc.extra_info.get("source") for doc in result}
    assert sources == {"http://example.com", "http://example.com/about"}

    texts = {doc.text for doc in result}
    assert texts == {"Root content", "About content"}

    assert mock_requests_get.call_count == 2
    assert loader_call_order == ["http://example.com", "http://example.com/about"]


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get):
    """A list input uses only its first URL, and a missing scheme gets http:// prepended."""
    mock_requests_get.return_value = DummyResponse("No links here")

    doc = MagicMock(spec=LCDocument)
    doc.page_content = "Homepage"
    doc.metadata = {"source": "http://example.com"}

    loader_instance = MagicMock()
    loader_instance.load.return_value = [doc]

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=loader_instance)

    result = crawler.load_data(["example.com", "unused.com"])

    # Only the first list entry is crawled, with the scheme added.
    mock_requests_get.assert_called_once_with("http://example.com")
    crawler.loader.assert_called_once_with(["http://example.com"])

    assert len(result) == 1
    assert result[0].text == "Homepage"
    assert result[0].extra_info == {"source": "http://example.com"}


@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_respects_limit(mock_requests_get):
    """With limit=1, only the root page is fetched and loaded."""
    responses = {
        "http://example.com": DummyResponse(
            """
            <html>
              <body>
                <a href="http://example.com/about">About</a>
              </body>
            </html>
            """
        ),
        "http://example.com/about": DummyResponse("About"),
    }

    mock_requests_get.side_effect = lambda url: responses[url]

    root_doc = MagicMock(spec=LCDocument)
    root_doc.page_content = "Root content"
    root_doc.metadata = {"source": "http://example.com"}

    about_doc = MagicMock(spec=LCDocument)
    about_doc.page_content = "About content"
    about_doc.metadata = {"source": "http://example.com/about"}

    loader_instances = {
        "http://example.com": MagicMock(),
        "http://example.com/about": MagicMock(),
    }
    loader_instances["http://example.com"].load.return_value = [root_doc]
    loader_instances["http://example.com/about"].load.return_value = [about_doc]

    crawler = CrawlerLoader(limit=1)
    crawler.loader = MagicMock(
        side_effect=lambda url_list: loader_instances[url_list[0]]
    )

    result = crawler.load_data("http://example.com")

    assert len(result) == 1
    assert result[0].text == "Root content"
    assert mock_requests_get.call_count == 1
    assert crawler.loader.call_count == 1


@patch("application.parser.remote.crawler_loader.logging")
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_logs_and_skips_on_loader_error(mock_requests_get, mock_logging):
    """A loader failure is logged with exc_info and the URL is skipped, not raised."""
    mock_requests_get.return_value = DummyResponse("Error route")

    failing_loader_instance = MagicMock()
    failing_loader_instance.load.side_effect = Exception("load failure")

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=failing_loader_instance)

    result = crawler.load_data("http://example.com")

    assert result == []
    mock_requests_get.assert_called_once_with("http://example.com")
    failing_loader_instance.load.assert_called_once()

    mock_logging.error.assert_called_once()
    message, = mock_logging.error.call_args.args
    assert "Error processing URL http://example.com" in message
    assert mock_logging.error.call_args.kwargs.get("exc_info") is True
# tests/parser/remote/test_crawler_markdown.py
#
# Unit tests for the markdown-producing CrawlerLoader: external-link
# filtering, optional subdomain crawling, and fetch-error handling.
# tldextract and markdownify are monkeypatched so the tests are hermetic.
#
# NOTE(review): recovered from a whitespace-mangled patch in which the HTML
# fixture literals lost their markup; anchors/titles below are reconstructed
# to be consistent with each test's assertions — confirm against the
# original patch if available.
from types import SimpleNamespace
from unittest.mock import MagicMock

import pytest
import requests

from application.parser.remote.crawler_markdown import CrawlerLoader
from application.parser.schema.base import Document


class DummyResponse:
    """Minimal stand-in for requests.Response: just .text and a no-op raise_for_status."""

    def __init__(self, text):
        self.text = text

    def raise_for_status(self):
        return None


def _fake_extract(value: str) -> SimpleNamespace:
    """Tiny tldextract.extract replacement: split host into (domain, suffix)."""
    value = value.split("//")[-1]
    host = value.split("/")[0]
    parts = host.split(".")
    if len(parts) >= 2:
        domain = parts[-2]
        suffix = parts[-1]
    else:
        domain = host
        suffix = ""
    return SimpleNamespace(domain=domain, suffix=suffix)


@pytest.fixture(autouse=True)
def _patch_tldextract(monkeypatch):
    # Avoid tldextract's suffix-list lookup (and any network access).
    monkeypatch.setattr(
        "application.parser.remote.crawler_markdown.tldextract.extract",
        _fake_extract,
    )


@pytest.fixture(autouse=True)
def _patch_markdownify(monkeypatch):
    # Map exact HTML strings to canned markdown; unknown HTML passes through.
    outputs = {}

    def fake_markdownify(html, *_, **__):
        return outputs.get(html, html)

    monkeypatch.setattr(
        "application.parser.remote.crawler_markdown.markdownify",
        fake_markdownify,
    )
    return outputs


def _setup_session(mock_get_side_effect):
    """Build a mock requests.Session whose .get delegates to the side effect."""
    session = MagicMock()
    session.get.side_effect = mock_get_side_effect
    return session


def test_load_data_filters_external_links(_patch_markdownify):
    """Same-domain links are crawled; links to other domains are ignored."""
    root_html = (
        "<html><head><title>Home</title></head><body>"
        '<a href="http://example.com/about">About</a>'
        '<a href="http://other.com">Other</a>'
        "<p>Welcome</p>"
        "</body></html>"
    )
    about_html = (
        "<html><head><title>About</title></head><body>About page</body></html>"
    )

    _patch_markdownify[root_html] = "Home Markdown"
    _patch_markdownify[about_html] = "About Markdown"

    responses = {
        "http://example.com": DummyResponse(root_html),
        "http://example.com/about": DummyResponse(about_html),
    }

    loader = CrawlerLoader(limit=5)
    loader.session = _setup_session(lambda url, timeout=10: responses[url])

    docs = loader.load_data("http://example.com")

    assert len(docs) == 2
    for doc in docs:
        assert isinstance(doc, Document)
        assert doc.extra_info["source"] in responses
    texts = {doc.text for doc in docs}
    assert texts == {"Home Markdown", "About Markdown"}


def test_load_data_allows_subdomains(_patch_markdownify):
    """With allow_subdomains=True, links on sibling subdomains are crawled too."""
    root_html = (
        "<html><head><title>Home</title></head><body>"
        '<a href="http://blog.example.com/post">Blog</a>'
        "</body></html>"
    )
    blog_html = (
        "<html><head><title>Blog</title></head><body>Blog post</body></html>"
    )

    _patch_markdownify[root_html] = "Home Markdown"
    _patch_markdownify[blog_html] = "Blog Markdown"

    responses = {
        "http://example.com": DummyResponse(root_html),
        "http://blog.example.com/post": DummyResponse(blog_html),
    }

    loader = CrawlerLoader(limit=5, allow_subdomains=True)
    loader.session = _setup_session(lambda url, timeout=10: responses[url])

    docs = loader.load_data("http://example.com")

    sources = {doc.extra_info["source"] for doc in docs}
    assert "http://blog.example.com/post" in sources
    assert len(docs) == 2


def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
    """A RequestException on a linked page is reported and skipped, not raised."""
    root_html = (
        "<html><head><title>Home</title></head><body>"
        '<a href="http://example.com/about">About</a>'
        "</body></html>"
    )

    _patch_markdownify[root_html] = "Home Markdown"

    def side_effect(url, timeout=10):
        if url == "http://example.com":
            return DummyResponse(root_html)
        raise requests.exceptions.RequestException("boom")

    loader = CrawlerLoader(limit=5)
    loader.session = _setup_session(side_effect)
    mock_print = MagicMock()
    monkeypatch.setattr("builtins.print", mock_print)

    docs = loader.load_data("http://example.com")

    assert len(docs) == 1
    assert docs[0].text == "Home Markdown"
    assert mock_print.called
# tests/parser/remote/test_web_loader.py
#
# Unit tests for WebLoader: header configuration, URL normalization,
# multi-URL loading, partial-failure error handling, and integration with
# the BaseRemote interface. The underlying WebBaseLoader class is mocked
# throughout so no network access occurs.
import pytest
from unittest.mock import patch, MagicMock
from urllib.parse import urlparse

from application.parser.remote.web_loader import WebLoader, headers
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument


@pytest.fixture
def web_loader():
    return WebLoader()


@pytest.fixture
def mock_langchain_document():
    """A canned LangChain document with content and metadata."""
    doc = MagicMock(spec=LCDocument)
    doc.page_content = "Test web page content"
    doc.metadata = {"source": "https://example.com", "title": "Test Page"}
    return doc


@pytest.fixture
def mock_web_base_loader():
    """A mocked WebBaseLoader class paired with the instance it returns."""
    mock_loader_class = MagicMock()
    mock_loader_instance = MagicMock()
    mock_loader_class.return_value = mock_loader_instance
    return mock_loader_class, mock_loader_instance


def _make_loader_mock(docs):
    """Return (loader_class_mock, loader_instance_mock) where load() yields *docs*."""
    instance = MagicMock()
    instance.load.return_value = docs
    loader_class = MagicMock(return_value=instance)
    return loader_class, instance


def _make_lc_doc(content, metadata):
    """Build a MagicMock LangChain document with the given content/metadata."""
    doc = MagicMock(spec=LCDocument)
    doc.page_content = content
    doc.metadata = metadata
    return doc


class TestWebLoaderInitialization:
    """WebLoader construction."""

    def test_init(self, web_loader):
        """The loader attribute is the WebBaseLoader class itself."""
        assert web_loader.loader is not None
        from langchain_community.document_loaders import WebBaseLoader
        assert web_loader.loader == WebBaseLoader


class TestWebLoaderHeaders:
    """Module-level headers dict used for every request."""

    def test_headers_defined(self):
        """All expected browser-style header keys are present."""
        assert isinstance(headers, dict)
        for key in (
            "User-Agent",
            "Accept",
            "Accept-Language",
            "Referer",
            "DNT",
            "Connection",
            "Upgrade-Insecure-Requests",
        ):
            assert key in headers

    def test_headers_values(self):
        """Header values carry reasonable browser-like contents."""
        assert headers["User-Agent"] == "Mozilla/5.0"
        assert "text/html" in headers["Accept"]
        assert headers["Referer"] == "https://www.google.com/"
        assert headers["DNT"] == "1"
        assert headers["Connection"] == "keep-alive"


class TestWebLoaderLoadData:
    """WebLoader.load_data happy paths."""

    def test_load_data_single_url_string(self, web_loader, mock_langchain_document):
        """A single URL string yields Documents with text and extra_info."""
        loader_class, loader_instance = _make_loader_mock([mock_langchain_document])
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 1
        assert isinstance(result[0], Document)
        assert result[0].text == "Test web page content"
        assert result[0].extra_info == {
            "source": "https://example.com",
            "title": "Test Page",
        }

        loader_class.assert_called_once_with(
            ["https://example.com"], header_template=headers
        )
        loader_instance.load.assert_called_once()

    def test_load_data_multiple_urls_list(self, web_loader):
        """Each URL in a list gets its own loader call; results are concatenated."""
        doc1 = _make_lc_doc("Content from site 1", {"source": "https://site1.com"})
        doc2 = _make_lc_doc("Content from site 2", {"source": "https://site2.com"})

        instance1 = MagicMock()
        instance1.load.return_value = [doc1]
        instance2 = MagicMock()
        instance2.load.return_value = [doc2]
        loader_class = MagicMock(side_effect=[instance1, instance2])
        web_loader.loader = loader_class

        result = web_loader.load_data(["https://site1.com", "https://site2.com"])

        assert len(result) == 2
        assert all(isinstance(doc, Document) for doc in result)
        assert result[0].text == "Content from site 1"
        assert result[1].text == "Content from site 2"
        assert result[0].extra_info == {"source": "https://site1.com"}
        assert result[1].extra_info == {"source": "https://site2.com"}

        assert loader_class.call_count == 2
        loader_class.assert_any_call(["https://site1.com"], header_template=headers)
        loader_class.assert_any_call(["https://site2.com"], header_template=headers)

    def test_load_data_url_without_scheme(self, web_loader, mock_langchain_document):
        """A bare hostname is normalized with an http:// prefix."""
        loader_class, _ = _make_loader_mock([mock_langchain_document])
        web_loader.loader = loader_class

        result = web_loader.load_data("example.com")

        assert len(result) == 1
        assert isinstance(result[0], Document)
        loader_class.assert_called_once_with(
            ["http://example.com"], header_template=headers
        )

    def test_load_data_url_with_scheme(self, web_loader, mock_langchain_document):
        """A URL that already carries a scheme is passed through unmodified."""
        loader_class, _ = _make_loader_mock([mock_langchain_document])
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 1
        loader_class.assert_called_once_with(
            ["https://example.com"], header_template=headers
        )

    def test_load_data_multiple_documents_per_url(self, web_loader):
        """One URL can yield several documents; all are converted in order."""
        doc1 = _make_lc_doc(
            "First document content",
            {"source": "https://example.com", "section": "intro"},
        )
        doc2 = _make_lc_doc(
            "Second document content",
            {"source": "https://example.com", "section": "main"},
        )
        loader_class, _ = _make_loader_mock([doc1, doc2])
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 2
        assert result[0].text == "First document content"
        assert result[1].text == "Second document content"
        assert result[0].extra_info == {
            "source": "https://example.com",
            "section": "intro",
        }
        assert result[1].extra_info == {
            "source": "https://example.com",
            "section": "main",
        }


class TestWebLoaderErrorHandling:
    """WebLoader.load_data failure behavior."""

    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_single_url_error(self, mock_logging, web_loader):
        """A load failure returns [] and logs the URL with exc_info."""
        instance = MagicMock()
        instance.load.side_effect = Exception("Network error")
        web_loader.loader = MagicMock(return_value=instance)

        result = web_loader.load_data("https://invalid-url.com")

        assert result == []  # Should return empty list on error
        mock_logging.error.assert_called_once()
        error_call = mock_logging.error.call_args
        assert "Error processing URL https://invalid-url.com" in error_call[0][0]
        assert error_call[1]["exc_info"] is True

    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_partial_failure(self, mock_logging, web_loader):
        """When one of two URLs fails, the other's documents still come back."""
        good_doc = _make_lc_doc("Success content", {"source": "https://good-url.com"})

        good_instance = MagicMock()
        good_instance.load.return_value = [good_doc]
        bad_instance = MagicMock()
        bad_instance.load.side_effect = Exception("Network error")
        web_loader.loader = MagicMock(side_effect=[good_instance, bad_instance])

        result = web_loader.load_data(["https://good-url.com", "https://bad-url.com"])

        assert len(result) == 1  # Only successful URL should be in results
        assert result[0].text == "Success content"
        assert result[0].extra_info == {"source": "https://good-url.com"}

        mock_logging.error.assert_called_once()
        error_call = mock_logging.error.call_args
        assert "Error processing URL https://bad-url.com" in error_call[0][0]


class TestWebLoaderEdgeCases:
    """Degenerate inputs."""

    def test_load_data_empty_list(self, web_loader):
        """An empty URL list yields an empty result without touching the loader."""
        assert web_loader.load_data([]) == []

    def test_load_data_empty_response(self, web_loader):
        """An empty load() result propagates as an empty list."""
        loader_class, _ = _make_loader_mock([])
        web_loader.loader = loader_class

        assert web_loader.load_data("https://empty-page.com") == []

    def test_url_scheme_detection(self):
        """Sanity-check the urlparse scheme logic load_data relies on."""
        assert urlparse("https://example.com").scheme == "https"
        assert urlparse("http://example.com").scheme == "http"
        assert urlparse("ftp://example.com").scheme == "ftp"

        assert urlparse("example.com").scheme == ""
        assert urlparse("www.example.com").scheme == ""


class TestWebLoaderIntegration:
    """Interaction with the BaseRemote base class."""

    def test_inherits_from_base_remote(self, web_loader):
        """WebLoader is a BaseRemote."""
        from application.parser.remote.base import BaseRemote
        assert isinstance(web_loader, BaseRemote)

    def test_implements_load_data_method(self, web_loader):
        """The required load_data entry point exists and is callable."""
        assert hasattr(web_loader, 'load_data')
        assert callable(web_loader.load_data)

    def test_load_langchain_documents_method(self, web_loader, mock_langchain_document):
        """The inherited load_langchain_documents returns raw LangChain docs."""
        loader_class, _ = _make_loader_mock([mock_langchain_document])
        web_loader.loader = loader_class

        result = web_loader.load_langchain_documents(inputs="https://example.com")

        assert len(result) == 1
        assert isinstance(result[0], LCDocument)
        assert result[0].page_content == "Test web page content"
        assert result[0].metadata == {
            "source": "https://example.com",
            "title": "Test Page",
        }