(tests:parsers) remote

This commit is contained in:
ManishMadan2882
2025-09-29 19:39:24 +05:30
parent 2b38f80d04
commit 8c91b1c527
3 changed files with 609 additions and 0 deletions

View File

@@ -0,0 +1,167 @@
from unittest.mock import MagicMock, patch
from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument
class DummyResponse:
    """Bare-bones substitute for an HTTP response object.

    Exposes only the two members the tests in this module rely on: the
    ``text`` attribute and a ``raise_for_status`` method that never raises.
    """

    def __init__(self, text: str) -> None:
        # Payload handed back verbatim through ``.text``.
        self.text = text

    def raise_for_status(self) -> None:
        """Do nothing, i.e. behave like a successful response."""
        return None
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_crawls_same_domain_links(mock_requests_get):
    """Crawl from the root page and expect the root and ``/about`` documents.

    The root page links to ``/about`` and to ``https://external.com/news``.
    The fake ``requests.get`` raises ``AssertionError`` for any URL other than
    the two ``example.com`` pages, so a request for the external link would
    fail the test.
    """
    root_url = "http://example.com"
    about_url = "http://example.com/about"
    root_html = """
<html>
<body>
<a href='/about'>About</a>
<a href='https://external.com/news'>External</a>
</body>
</html>
"""
    pages = {
        root_url: DummyResponse(root_html),
        about_url: DummyResponse("<html><body>About page</body></html>"),
    }

    def fake_get(url: str):
        if url in pages:
            return pages[url]
        raise AssertionError(f"Unexpected request for URL: {url}")

    mock_requests_get.side_effect = fake_get

    # One stub loader per page, each yielding a single LangChain-style document.
    stub_loaders = {}
    for page_url, page_text in ((root_url, "Root content"), (about_url, "About content")):
        lc_doc = MagicMock(spec=LCDocument)
        lc_doc.page_content = page_text
        lc_doc.metadata = {"source": page_url}
        stub = MagicMock()
        stub.load.return_value = [lc_doc]
        stub_loaders[page_url] = stub

    # Records the first URL of every loader construction, in call order.
    visited = []

    def make_loader(url_list):
        first_url = url_list[0]
        visited.append(first_url)
        return stub_loaders[first_url]

    crawler = CrawlerLoader(limit=5)
    crawler.loader = MagicMock(side_effect=make_loader)

    documents = crawler.load_data(root_url)

    assert len(documents) == 2
    assert all(isinstance(item, Document) for item in documents)
    assert {item.extra_info.get("source") for item in documents} == {
        "http://example.com",
        "http://example.com/about",
    }
    assert {item.text for item in documents} == {"Root content", "About content"}
    assert mock_requests_get.call_count == 2
    assert visited == ["http://example.com", "http://example.com/about"]
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get):
    """Pass a list of bare hostnames and expect only the first one, with ``http://``.

    The assertions require exactly one ``requests.get`` call and exactly one
    loader construction, both for ``http://example.com``.
    """
    mock_requests_get.return_value = DummyResponse("<html><body>No links here</body></html>")

    homepage = MagicMock(spec=LCDocument)
    homepage.page_content = "Homepage"
    homepage.metadata = {"source": "http://example.com"}

    page_loader = MagicMock(**{"load.return_value": [homepage]})

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=page_loader)

    documents = crawler.load_data(["example.com", "unused.com"])

    mock_requests_get.assert_called_once_with("http://example.com")
    crawler.loader.assert_called_once_with(["http://example.com"])
    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.text == "Homepage"
    assert only_doc.extra_info == {"source": "http://example.com"}
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_respects_limit(mock_requests_get):
    """With ``limit=1``, expect only the root page to be fetched and loaded.

    The root page links to ``/about``, but the assertions require a single
    HTTP request, a single loader construction and a single document.
    """
    root_url = "http://example.com"
    about_url = "http://example.com/about"
    root_html = """
<html>
<body>
<a href='/about'>About</a>
</body>
</html>
"""
    pages = {
        root_url: DummyResponse(root_html),
        about_url: DummyResponse("<html><body>About</body></html>"),
    }

    def fake_get(url):
        return pages[url]

    mock_requests_get.side_effect = fake_get

    # Stub loaders exist for both pages even though only the root should be used.
    stub_loaders = {}
    for page_url, page_text in ((root_url, "Root content"), (about_url, "About content")):
        lc_doc = MagicMock(spec=LCDocument)
        lc_doc.page_content = page_text
        lc_doc.metadata = {"source": page_url}
        stub_loaders[page_url] = MagicMock(**{"load.return_value": [lc_doc]})

    def pick_loader(url_list):
        return stub_loaders[url_list[0]]

    crawler = CrawlerLoader(limit=1)
    crawler.loader = MagicMock(side_effect=pick_loader)

    documents = crawler.load_data(root_url)

    assert len(documents) == 1
    assert documents[0].text == "Root content"
    assert mock_requests_get.call_count == 1
    assert crawler.loader.call_count == 1
@patch("application.parser.remote.crawler_loader.logging")
@patch("application.parser.remote.crawler_loader.requests.get")
def test_load_data_logs_and_skips_on_loader_error(mock_requests_get, mock_logging):
    """Make the loader's ``load()`` raise and expect an empty result plus one error log.

    The log call must carry a single positional message naming the URL and
    ``exc_info=True``.
    """
    mock_requests_get.return_value = DummyResponse("<html><body>Error route</body></html>")

    broken_loader = MagicMock(**{"load.side_effect": Exception("load failure")})

    crawler = CrawlerLoader()
    crawler.loader = MagicMock(return_value=broken_loader)

    documents = crawler.load_data("http://example.com")

    assert documents == []
    mock_requests_get.assert_called_once_with("http://example.com")
    broken_loader.load.assert_called_once()
    mock_logging.error.assert_called_once()

    logged = mock_logging.error.call_args
    # Unpacking enforces that exactly one positional argument was logged.
    (message,) = logged.args
    assert "Error processing URL http://example.com" in message
    assert logged.kwargs.get("exc_info") is True

View File

@@ -0,0 +1,139 @@
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
import requests
from application.parser.remote.crawler_markdown import CrawlerLoader
from application.parser.schema.base import Document
class DummyResponse:
    """Tiny fake of an HTTP response: a ``text`` body and a no-op status check."""

    def __init__(self, text):
        # Body returned unchanged through ``.text``.
        self.text = text

    def raise_for_status(self):
        """Never raise, so the fake always reads as a successful response."""
        return None
def _fake_extract(value: str) -> SimpleNamespace:
value = value.split("//")[-1]
host = value.split("/")[0]
parts = host.split(".")
if len(parts) >= 2:
domain = parts[-2]
suffix = parts[-1]
else:
domain = host
suffix = ""
return SimpleNamespace(domain=domain, suffix=suffix)
@pytest.fixture(autouse=True)
def _patch_tldextract(monkeypatch):
    """Swap ``tldextract.extract`` in ``crawler_markdown`` for ``_fake_extract`` in every test."""
    target = "application.parser.remote.crawler_markdown.tldextract.extract"
    monkeypatch.setattr(target, _fake_extract)
@pytest.fixture(autouse=True)
def _patch_markdownify(monkeypatch):
    """Replace ``markdownify`` in ``crawler_markdown`` with a dictionary lookup.

    Returns the (initially empty) mapping so a test can register
    ``html -> markdown`` pairs. HTML with no registered entry is returned
    unchanged by the fake.
    """
    canned = {}

    def fake_markdownify(html, *_, **__):
        # Extra positional/keyword arguments are accepted and ignored.
        if html in canned:
            return canned[html]
        return html

    target = "application.parser.remote.crawler_markdown.markdownify"
    monkeypatch.setattr(target, fake_markdownify)
    return canned
def _setup_session(mock_get_side_effect):
session = MagicMock()
session.get.side_effect = mock_get_side_effect
return session
def test_load_data_filters_external_links(_patch_markdownify):
    """Crawl a root page linking to ``/about`` and ``https://other.com``.

    Only the two ``example.com`` pages have canned responses, and exactly two
    documents with those sources are expected.
    """
    root_html = """
<html><head><title>Home</title></head>
<body><a href="/about">About</a><a href="https://other.com">Other</a><p>Welcome</p></body>
</html>
"""
    about_html = "<html><head><title>About</title></head><body>About page</body></html>"

    # Register the markdown each page should convert to.
    _patch_markdownify.update({root_html: "Home Markdown", about_html: "About Markdown"})

    pages = {
        "http://example.com": DummyResponse(root_html),
        "http://example.com/about": DummyResponse(about_html),
    }

    def fake_get(url, timeout=10):
        return pages[url]

    crawler = CrawlerLoader(limit=5)
    crawler.session = _setup_session(fake_get)

    documents = crawler.load_data("http://example.com")

    assert len(documents) == 2
    for item in documents:
        assert isinstance(item, Document)
        assert item.extra_info["source"] in pages
    assert {item.text for item in documents} == {"Home Markdown", "About Markdown"}
def test_load_data_allows_subdomains(_patch_markdownify):
    """With ``allow_subdomains=True``, expect the ``blog.example.com`` link to be crawled."""
    blog_url = "http://blog.example.com/post"
    root_html = """
<html><head><title>Home</title></head>
<body><a href="http://blog.example.com/post">Blog</a></body>
</html>
"""
    blog_html = "<html><head><title>Blog</title></head><body>Blog post</body></html>"

    _patch_markdownify.update({root_html: "Home Markdown", blog_html: "Blog Markdown"})

    pages = {
        "http://example.com": DummyResponse(root_html),
        blog_url: DummyResponse(blog_html),
    }

    def fake_get(url, timeout=10):
        return pages[url]

    crawler = CrawlerLoader(limit=5, allow_subdomains=True)
    crawler.session = _setup_session(fake_get)

    documents = crawler.load_data("http://example.com")

    crawled_sources = {item.extra_info["source"] for item in documents}
    assert blog_url in crawled_sources
    assert len(documents) == 2
def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
    """Fail the fetch of the linked page and expect the root document alone.

    Every request except the root URL raises ``RequestException``. The test
    also requires that ``print`` was called at least once during the crawl.
    """
    root_url = "http://example.com"
    root_html = """
<html><head><title>Home</title></head>
<body><a href="/about">About</a></body>
</html>
"""
    _patch_markdownify[root_html] = "Home Markdown"

    def side_effect(url, timeout=10):
        if url != root_url:
            raise requests.exceptions.RequestException("boom")
        return DummyResponse(root_html)

    crawler = CrawlerLoader(limit=5)
    crawler.session = _setup_session(side_effect)

    # Intercept print() so the test can check that something was printed.
    print_spy = MagicMock()
    monkeypatch.setattr("builtins.print", print_spy)

    documents = crawler.load_data(root_url)

    assert len(documents) == 1
    assert documents[0].text == "Home Markdown"
    assert print_spy.called

View File

@@ -0,0 +1,303 @@
import pytest
from unittest.mock import patch, MagicMock
from urllib.parse import urlparse
from application.parser.remote.web_loader import WebLoader, headers
from application.parser.schema.base import Document
from langchain.docstore.document import Document as LCDocument
@pytest.fixture
def web_loader():
    """Provide a newly constructed ``WebLoader`` to the requesting test."""
    loader = WebLoader()
    return loader
@pytest.fixture
def mock_langchain_document():
    """Return a mock specced on the LangChain document class with fixed content and metadata."""
    lc_doc = MagicMock(spec=LCDocument)
    lc_doc.configure_mock(
        page_content="Test web page content",
        metadata={"source": "https://example.com", "title": "Test Page"},
    )
    return lc_doc
@pytest.fixture
def mock_web_base_loader():
    """Return ``(loader_class, loader_instance)`` mocks.

    Calling the class mock yields the instance mock.
    """
    loader_instance = MagicMock()
    loader_class = MagicMock(return_value=loader_instance)
    return loader_class, loader_instance
class TestWebLoaderInitialization:
    """Checks on the state of a freshly constructed ``WebLoader``."""

    def test_init(self, web_loader):
        """The ``loader`` attribute is set and equals LangChain's ``WebBaseLoader``."""
        configured_loader = web_loader.loader
        assert configured_loader is not None

        # Imported here, as in the rest of this test, so the module itself
        # does not import langchain_community at collection time.
        from langchain_community.document_loaders import WebBaseLoader

        assert configured_loader == WebBaseLoader
class TestWebLoaderHeaders:
    """Checks on the module-level ``headers`` dictionary exported by ``web_loader``."""

    def test_headers_defined(self):
        """``headers`` is a dict containing every expected header name."""
        assert isinstance(headers, dict)
        required_names = (
            "User-Agent",
            "Accept",
            "Accept-Language",
            "Referer",
            "DNT",
            "Connection",
            "Upgrade-Insecure-Requests",
        )
        for header_name in required_names:
            assert header_name in headers

    def test_headers_values(self):
        """Selected headers carry the exact values the loader is expected to send."""
        assert headers["User-Agent"] == "Mozilla/5.0"
        assert "text/html" in headers["Accept"]
        exact_values = (
            ("Referer", "https://www.google.com/"),
            ("DNT", "1"),
            ("Connection", "keep-alive"),
        )
        for header_name, expected in exact_values:
            assert headers[header_name] == expected
class TestWebLoaderLoadData:
    """Happy-path tests for ``WebLoader.load_data`` with the loader class mocked out."""

    def test_load_data_single_url_string(self, web_loader, mock_langchain_document):
        """A single URL string yields one converted ``Document``."""
        page_loader = MagicMock(**{"load.return_value": [mock_langchain_document]})
        loader_cls = MagicMock(return_value=page_loader)
        web_loader.loader = loader_cls

        documents = web_loader.load_data("https://example.com")

        assert len(documents) == 1
        first = documents[0]
        assert isinstance(first, Document)
        assert first.text == "Test web page content"
        assert first.extra_info == {"source": "https://example.com", "title": "Test Page"}
        loader_cls.assert_called_once_with(["https://example.com"], header_template=headers)
        page_loader.load.assert_called_once()

    def test_load_data_multiple_urls_list(self, web_loader):
        """A list of URLs builds one loader per URL and keeps input order."""
        site_one = MagicMock(spec=LCDocument)
        site_one.page_content = "Content from site 1"
        site_one.metadata = {"source": "https://site1.com"}

        site_two = MagicMock(spec=LCDocument)
        site_two.page_content = "Content from site 2"
        site_two.metadata = {"source": "https://site2.com"}

        first_loader = MagicMock(**{"load.return_value": [site_one]})
        second_loader = MagicMock(**{"load.return_value": [site_two]})

        # Successive constructions hand out the two loaders in this order.
        loader_cls = MagicMock(side_effect=[first_loader, second_loader])
        web_loader.loader = loader_cls

        documents = web_loader.load_data(["https://site1.com", "https://site2.com"])

        assert len(documents) == 2
        assert all(isinstance(item, Document) for item in documents)
        first, second = documents[0], documents[1]
        assert first.text == "Content from site 1"
        assert second.text == "Content from site 2"
        assert first.extra_info == {"source": "https://site1.com"}
        assert second.extra_info == {"source": "https://site2.com"}
        assert loader_cls.call_count == 2
        loader_cls.assert_any_call(["https://site1.com"], header_template=headers)
        loader_cls.assert_any_call(["https://site2.com"], header_template=headers)

    def test_load_data_url_without_scheme(self, web_loader, mock_langchain_document):
        """A scheme-less URL is expected to reach the loader prefixed with ``http://``."""
        page_loader = MagicMock(**{"load.return_value": [mock_langchain_document]})
        loader_cls = MagicMock(return_value=page_loader)
        web_loader.loader = loader_cls

        documents = web_loader.load_data("example.com")

        assert len(documents) == 1
        assert isinstance(documents[0], Document)
        # The loader class must have been given the http:// form of the URL.
        loader_cls.assert_called_once_with(["http://example.com"], header_template=headers)

    def test_load_data_url_with_scheme(self, web_loader, mock_langchain_document):
        """A URL that already has a scheme is expected to be passed through untouched."""
        page_loader = MagicMock(**{"load.return_value": [mock_langchain_document]})
        loader_cls = MagicMock(return_value=page_loader)
        web_loader.loader = loader_cls

        documents = web_loader.load_data("https://example.com")

        assert len(documents) == 1
        # The loader class must have been given the URL exactly as supplied.
        loader_cls.assert_called_once_with(["https://example.com"], header_template=headers)

    def test_load_data_multiple_documents_per_url(self, web_loader):
        """Every document returned for one URL is converted, in order."""
        intro = MagicMock(spec=LCDocument)
        intro.page_content = "First document content"
        intro.metadata = {"source": "https://example.com", "section": "intro"}

        main = MagicMock(spec=LCDocument)
        main.page_content = "Second document content"
        main.metadata = {"source": "https://example.com", "section": "main"}

        page_loader = MagicMock(**{"load.return_value": [intro, main]})
        web_loader.loader = MagicMock(return_value=page_loader)

        documents = web_loader.load_data("https://example.com")

        assert len(documents) == 2
        first, second = documents[0], documents[1]
        assert first.text == "First document content"
        assert second.text == "Second document content"
        assert first.extra_info == {"source": "https://example.com", "section": "intro"}
        assert second.extra_info == {"source": "https://example.com", "section": "main"}
class TestWebLoaderErrorHandling:
    """Failure-path tests: a loader that raises is expected to be logged and skipped."""

    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_single_url_error(self, mock_logging, web_loader):
        """One failing URL gives an empty result and a single error log with ``exc_info``."""
        broken_loader = MagicMock(**{"load.side_effect": Exception("Network error")})
        web_loader.loader = MagicMock(return_value=broken_loader)

        documents = web_loader.load_data("https://invalid-url.com")

        # Nothing could be loaded, so nothing is returned.
        assert documents == []
        mock_logging.error.assert_called_once()
        # call_args unpacks into (positional args, keyword args).
        log_args, log_kwargs = mock_logging.error.call_args
        assert "Error processing URL https://invalid-url.com" in log_args[0]
        assert log_kwargs["exc_info"] is True

    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_partial_failure(self, mock_logging, web_loader):
        """With one good and one bad URL, only the good document is returned."""
        good_doc = MagicMock(spec=LCDocument)
        good_doc.page_content = "Success content"
        good_doc.metadata = {"source": "https://good-url.com"}

        working_loader = MagicMock(**{"load.return_value": [good_doc]})
        broken_loader = MagicMock(**{"load.side_effect": Exception("Network error")})

        # First construction succeeds, second one blows up on load().
        web_loader.loader = MagicMock(side_effect=[working_loader, broken_loader])

        documents = web_loader.load_data(["https://good-url.com", "https://bad-url.com"])

        # Only the URL that loaded contributes to the result.
        assert len(documents) == 1
        survivor = documents[0]
        assert survivor.text == "Success content"
        assert survivor.extra_info == {"source": "https://good-url.com"}
        mock_logging.error.assert_called_once()
        log_args, _ = mock_logging.error.call_args
        assert "Error processing URL https://bad-url.com" in log_args[0]
class TestWebLoaderEdgeCases:
    """Boundary inputs for ``WebLoader.load_data`` plus a ``urlparse`` sanity check."""

    def test_load_data_empty_list(self, web_loader):
        """An empty URL list produces an empty result."""
        assert web_loader.load_data([]) == []

    def test_load_data_empty_response(self, web_loader):
        """A loader that returns no documents produces an empty result."""
        empty_loader = MagicMock(**{"load.return_value": []})
        web_loader.loader = MagicMock(return_value=empty_loader)

        documents = web_loader.load_data("https://empty-page.com")

        assert documents == []

    def test_url_scheme_detection(self):
        """``urlparse`` reports a scheme only when the URL actually carries one."""
        expected_schemes = {
            # URLs that include a scheme
            "https://example.com": "https",
            "http://example.com": "http",
            "ftp://example.com": "ftp",
            # URLs without a scheme
            "example.com": "",
            "www.example.com": "",
        }
        for url, scheme in expected_schemes.items():
            assert urlparse(url).scheme == scheme
class TestWebLoaderIntegration:
    """Checks that ``WebLoader`` fits the ``BaseRemote`` interface."""

    def test_inherits_from_base_remote(self, web_loader):
        """A ``WebLoader`` instance is also a ``BaseRemote`` instance."""
        from application.parser.remote.base import BaseRemote

        assert isinstance(web_loader, BaseRemote)

    def test_implements_load_data_method(self, web_loader):
        """``load_data`` exists on the instance and can be called."""
        assert hasattr(web_loader, 'load_data')
        load_data_attr = web_loader.load_data
        assert callable(load_data_attr)

    def test_load_langchain_documents_method(self, web_loader, mock_langchain_document):
        """``load_langchain_documents`` returns LangChain-typed documents with content and metadata intact."""
        page_loader = MagicMock(**{"load.return_value": [mock_langchain_document]})
        web_loader.loader = MagicMock(return_value=page_loader)

        documents = web_loader.load_langchain_documents(inputs="https://example.com")

        assert len(documents) == 1
        first = documents[0]
        assert isinstance(first, LCDocument)
        assert first.page_content == "Test web page content"
        assert first.metadata == {"source": "https://example.com", "title": "Test Page"}