Files
DocsGPT/tests/parser/remote/test_web_loader.py
Alex 81b6ee5daa Pg 4 (#2390)
* feat: postgres tests

* feat: mongo cutoff

* feat: mongo cutoff

* feat: adjust docs and compose files

* fix: mini code mongo removals

* fix: tests and k8s mongo stuff

* feat: test fixes

* fix: ruff

* fix: vale

* Potential fix for pull request finding 'CodeQL / Clear-text logging of sensitive information'

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* fix: mini suggestions

* vale lint fix 2

* fix: codeql columns thing

* fix: test mongo

* fix: tests coverage

* feat: better tests 4

* feat: more tests

* feat: decent coverage

* fix: ruff fixes

* fix: remove mongo mock

* feat: enhance workflow engine and API routes; add document retrieval and source handling

* feat: e2e tests

* fix: mcp, mongo and more

* fix: mini codeql warning

* fix: agent chunk view

* fix: mini issues

* fix: more pg fixes

* feat: postgres prep on start

* feat: qa tests

* fix: mini improvements

* fix: tests

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Siddhant Rai <siddhant.rai.5686@gmail.com>
2026-04-18 13:13:57 +01:00

380 lines
15 KiB
Python

import pytest
from unittest.mock import patch, MagicMock
from urllib.parse import urlparse
from application.core.url_validation import SSRFError
from application.parser.remote.web_loader import WebLoader, headers
from application.parser.schema.base import Document
from langchain_core.documents import Document as LCDocument
def _mock_validate_url(url):
"""Mock validate_url that allows test URLs through and prepends scheme like the real impl."""
if not urlparse(url).scheme:
url = "http://" + url
return url
@pytest.fixture
def web_loader():
    """Provide a freshly constructed ``WebLoader`` for each test."""
    loader_under_test = WebLoader()
    return loader_under_test
@pytest.fixture
def mock_langchain_document():
    """Build a mock specced to the LangChain document class.

    The mock is preloaded with sample ``page_content`` and ``metadata``
    values that the tests compare against.
    """
    fake_doc = MagicMock(spec=LCDocument)
    # configure_mock performs the same attribute assignments in one call.
    fake_doc.configure_mock(
        page_content="Test web page content",
        metadata={"source": "https://example.com", "title": "Test Page"},
    )
    return fake_doc
@pytest.fixture
def mock_web_base_loader():
    """Return a ``(loader_class, loader_instance)`` pair of mocks.

    Calling ``loader_class`` yields ``loader_instance``, so a test can
    install the class on a ``WebLoader`` and script the instance's ``load``.
    """
    loader_instance = MagicMock()
    loader_class = MagicMock(return_value=loader_instance)
    return loader_class, loader_instance
class TestWebLoaderInitialization:
    """Checks on the state of a freshly built WebLoader."""

    def test_init(self, web_loader):
        """A new WebLoader exposes langchain's ``WebBaseLoader`` as ``loader``."""
        configured_loader = web_loader.loader
        assert configured_loader is not None

        # Imported locally, as in the original test, so the dependency is
        # only required when this test actually runs.
        from langchain_community.document_loaders import WebBaseLoader

        assert configured_loader == WebBaseLoader
class TestWebLoaderHeaders:
    """Checks on the module-level request ``headers`` mapping."""

    def test_headers_defined(self):
        """``headers`` is a dict that contains every expected header name."""
        assert isinstance(headers, dict)
        expected_names = (
            "User-Agent",
            "Accept",
            "Accept-Language",
            "Referer",
            "DNT",
            "Connection",
            "Upgrade-Insecure-Requests",
        )
        for header_name in expected_names:
            assert header_name in headers

    def test_headers_values(self):
        """Selected headers carry the exact values the tests expect."""
        assert headers["User-Agent"] == "Mozilla/5.0"
        # Accept is only checked for containing the HTML media type.
        assert "text/html" in headers["Accept"]
        exact_values = (
            ("Referer", "https://www.google.com/"),
            ("DNT", "1"),
            ("Connection", "keep-alive"),
        )
        for header_name, expected in exact_values:
            assert headers[header_name] == expected
class TestWebLoaderLoadData:
    """Test WebLoader load_data method.

    Each test replaces ``web_loader.loader`` with the mock class supplied by
    the ``mock_web_base_loader`` fixture and patches ``validate_url`` with
    ``_mock_validate_url``, so the real ``WebBaseLoader`` and the real URL
    validation are never invoked.
    """

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_single_url_string(
        self, mock_validate, web_loader, mock_langchain_document, mock_web_base_loader
    ):
        """Test loading data from a single URL passed as string."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [mock_langchain_document]
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 1
        assert isinstance(result[0], Document)
        assert result[0].text == "Test web page content"
        assert result[0].extra_info == {"source": "https://example.com", "title": "Test Page"}
        loader_class.assert_called_once_with(["https://example.com"], header_template=headers)
        loader_instance.load.assert_called_once()

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_multiple_urls_list(self, mock_validate, web_loader, mock_web_base_loader):
        """Test loading data from multiple URLs passed as list."""
        doc1 = MagicMock(spec=LCDocument)
        doc1.page_content = "Content from site 1"
        doc1.metadata = {"source": "https://site1.com"}
        doc2 = MagicMock(spec=LCDocument)
        doc2.page_content = "Content from site 2"
        doc2.metadata = {"source": "https://site2.com"}

        first_loader = MagicMock()
        first_loader.load.return_value = [doc1]
        second_loader = MagicMock()
        second_loader.load.return_value = [doc2]

        # side_effect takes precedence over the fixture's return_value, so
        # each construction of the loader class yields the next instance.
        loader_class, _ = mock_web_base_loader
        loader_class.side_effect = [first_loader, second_loader]
        web_loader.loader = loader_class

        urls = ["https://site1.com", "https://site2.com"]
        result = web_loader.load_data(urls)

        assert len(result) == 2
        assert all(isinstance(doc, Document) for doc in result)
        assert result[0].text == "Content from site 1"
        assert result[1].text == "Content from site 2"
        assert result[0].extra_info == {"source": "https://site1.com"}
        assert result[1].extra_info == {"source": "https://site2.com"}
        assert loader_class.call_count == 2
        loader_class.assert_any_call(["https://site1.com"], header_template=headers)
        loader_class.assert_any_call(["https://site2.com"], header_template=headers)

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_url_without_scheme(
        self, mock_validate, web_loader, mock_langchain_document, mock_web_base_loader
    ):
        """Test loading data from URL without scheme (should add http://)."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [mock_langchain_document]
        web_loader.loader = loader_class

        result = web_loader.load_data("example.com")

        assert len(result) == 1
        assert isinstance(result[0], Document)
        # Verify WebBaseLoader was called with http:// prefix
        loader_class.assert_called_once_with(["http://example.com"], header_template=headers)

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_url_with_scheme(
        self, mock_validate, web_loader, mock_langchain_document, mock_web_base_loader
    ):
        """Test loading data from URL with scheme (should not modify)."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [mock_langchain_document]
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 1
        # Verify WebBaseLoader was called with original URL
        loader_class.assert_called_once_with(["https://example.com"], header_template=headers)

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_multiple_documents_per_url(self, mock_validate, web_loader, mock_web_base_loader):
        """Test loading multiple documents from a single URL."""
        doc1 = MagicMock(spec=LCDocument)
        doc1.page_content = "First document content"
        doc1.metadata = {"source": "https://example.com", "section": "intro"}
        doc2 = MagicMock(spec=LCDocument)
        doc2.page_content = "Second document content"
        doc2.metadata = {"source": "https://example.com", "section": "main"}

        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [doc1, doc2]
        web_loader.loader = loader_class

        result = web_loader.load_data("https://example.com")

        assert len(result) == 2
        assert result[0].text == "First document content"
        assert result[1].text == "Second document content"
        assert result[0].extra_info == {"source": "https://example.com", "section": "intro"}
        assert result[1].extra_info == {"source": "https://example.com", "section": "main"}
class TestWebLoaderErrorHandling:
    """Test WebLoader error handling.

    The module's ``logging`` reference is patched so the tests can inspect
    the ``logging.error`` call made when a loader's ``load`` raises.
    """

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_single_url_error(self, mock_logging, mock_validate, web_loader, mock_web_base_loader):
        """Test error handling for single URL that fails to load."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.side_effect = Exception("Network error")
        web_loader.loader = loader_class

        result = web_loader.load_data("https://invalid-url.com")

        assert result == []  # Should return empty list on error
        mock_logging.error.assert_called_once()
        error_call = mock_logging.error.call_args
        assert "Error processing URL https://invalid-url.com" in error_call[0][0]
        assert error_call[1]["exc_info"] is True

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    @patch('application.parser.remote.web_loader.logging')
    def test_load_data_partial_failure(self, mock_logging, mock_validate, web_loader, mock_web_base_loader):
        """Test partial failure - some URLs succeed, some fail."""
        doc1 = MagicMock(spec=LCDocument)
        doc1.page_content = "Success content"
        doc1.metadata = {"source": "https://good-url.com"}

        working_loader = MagicMock()
        working_loader.load.return_value = [doc1]
        failing_loader = MagicMock()
        failing_loader.load.side_effect = Exception("Network error")

        # side_effect takes precedence over the fixture's return_value: the
        # first construction gets the working loader, the second the failing one.
        loader_class, _ = mock_web_base_loader
        loader_class.side_effect = [working_loader, failing_loader]
        web_loader.loader = loader_class

        urls = ["https://good-url.com", "https://bad-url.com"]
        result = web_loader.load_data(urls)

        assert len(result) == 1  # Only successful URL should be in results
        assert result[0].text == "Success content"
        assert result[0].extra_info == {"source": "https://good-url.com"}
        mock_logging.error.assert_called_once()
        error_call = mock_logging.error.call_args
        assert "Error processing URL https://bad-url.com" in error_call[0][0]
class TestWebLoaderSSRF:
    """Test WebLoader SSRF protection.

    ``validate_url`` is patched to raise ``SSRFError`` for the URLs under
    test; the assertions check that such URLs never reach the loader class.
    """

    @patch("application.parser.remote.web_loader.validate_url")
    def test_load_data_skips_url_failing_ssrf_validation(self, mock_validate, web_loader, mock_web_base_loader):
        """A URL that fails SSRF validation must be skipped, never reaching the fetcher."""
        mock_validate.side_effect = SSRFError("Access to private/internal IP addresses is not allowed.")
        loader_class, _ = mock_web_base_loader
        web_loader.loader = loader_class

        result = web_loader.load_data("http://169.254.169.254/latest/meta-data/")

        assert result == []
        mock_validate.assert_called_once_with("http://169.254.169.254/latest/meta-data/")
        loader_class.assert_not_called()

    @patch("application.parser.remote.web_loader.validate_url")
    @patch("application.parser.remote.web_loader.logging")
    def test_load_data_logs_warning_on_ssrf_failure(
        self, mock_logging, mock_validate, web_loader, mock_web_base_loader
    ):
        """SSRF rejections should be logged as warnings, not errors."""
        mock_validate.side_effect = SSRFError("blocked")
        loader_class, _ = mock_web_base_loader
        web_loader.loader = loader_class

        web_loader.load_data("http://127.0.0.1")

        mock_logging.warning.assert_called_once()
        warning_msg = mock_logging.warning.call_args[0][0]
        assert "SSRF" in warning_msg
        assert "127.0.0.1" in warning_msg

    @patch("application.parser.remote.web_loader.validate_url")
    def test_load_data_mixed_safe_and_unsafe_urls(self, mock_validate, web_loader, mock_web_base_loader):
        """In a list of URLs, unsafe ones are skipped while safe ones still load."""

        def validate(url):
            # Reject only the cloud metadata address; pass everything else through.
            if "169.254.169.254" in url:
                raise SSRFError("metadata blocked")
            return url

        mock_validate.side_effect = validate

        safe_doc = MagicMock(spec=LCDocument)
        safe_doc.page_content = "Public page"
        safe_doc.metadata = {"source": "https://example.com"}

        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [safe_doc]
        web_loader.loader = loader_class

        urls = ["https://example.com", "http://169.254.169.254/latest/meta-data/"]
        result = web_loader.load_data(urls)

        assert len(result) == 1
        assert result[0].text == "Public page"
        # Only the safe URL should have been fetched
        loader_class.assert_called_once_with(["https://example.com"], header_template=headers)
class TestWebLoaderEdgeCases:
    """Test WebLoader edge cases."""

    def test_load_data_empty_list(self, web_loader):
        """Test loading data with empty URL list."""
        result = web_loader.load_data([])
        assert result == []

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_data_empty_response(self, mock_validate, web_loader, mock_web_base_loader):
        """Test loading data when WebBaseLoader returns empty list."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = []
        web_loader.loader = loader_class

        result = web_loader.load_data("https://empty-page.com")

        assert result == []

    def test_url_scheme_detection(self):
        """Test URL scheme detection logic.

        Exercises ``urlparse`` directly, pinning the scheme detection that
        ``_mock_validate_url`` relies on.
        """
        # Test URLs with schemes
        assert urlparse("https://example.com").scheme == "https"
        assert urlparse("http://example.com").scheme == "http"
        assert urlparse("ftp://example.com").scheme == "ftp"
        # Test URLs without schemes
        assert urlparse("example.com").scheme == ""
        assert urlparse("www.example.com").scheme == ""
class TestWebLoaderIntegration:
    """Test WebLoader integration with base class."""

    def test_inherits_from_base_remote(self, web_loader):
        """Test that WebLoader inherits from BaseRemote."""
        from application.parser.remote.base import BaseRemote

        assert isinstance(web_loader, BaseRemote)

    def test_implements_load_data_method(self, web_loader):
        """Test that WebLoader implements required load_data method."""
        assert hasattr(web_loader, 'load_data')
        assert callable(web_loader.load_data)

    @patch("application.parser.remote.web_loader.validate_url", side_effect=_mock_validate_url)
    def test_load_langchain_documents_method(
        self, mock_validate, web_loader, mock_langchain_document, mock_web_base_loader
    ):
        """Test inherited load_langchain_documents method."""
        loader_class, loader_instance = mock_web_base_loader
        loader_instance.load.return_value = [mock_langchain_document]
        web_loader.loader = loader_class

        result = web_loader.load_langchain_documents(inputs="https://example.com")

        assert len(result) == 1
        assert isinstance(result[0], LCDocument)
        assert result[0].page_content == "Test web page content"
        assert result[0].metadata == {"source": "https://example.com", "title": "Test Page"}