diff --git a/tests/parser/remote/test_crawler_loader.py b/tests/parser/remote/test_crawler_loader.py
new file mode 100644
index 00000000..0a100abb
--- /dev/null
+++ b/tests/parser/remote/test_crawler_loader.py
@@ -0,0 +1,167 @@
+from unittest.mock import MagicMock, patch
+
+from application.parser.remote.crawler_loader import CrawlerLoader
+from application.parser.schema.base import Document
+from langchain.docstore.document import Document as LCDocument
+
+
+class DummyResponse:  # minimal stand-in for requests.Response: .text plus a no-op status check
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+    def raise_for_status(self) -> None:  # behaves like a 200 OK response
+        return None
+
+
+@patch("application.parser.remote.crawler_loader.requests.get")
+def test_load_data_crawls_same_domain_links(mock_requests_get):
+    # Root page links to a same-domain /about page and an external site;
+    # only the same-domain link should be fetched and loaded.
+    responses = {
+        "http://example.com": DummyResponse(
+            """
+            <html><body>
+            <a href="http://example.com/about">About</a>
+            <a href="http://external.com">External</a>
+            </body></html>
+            """
+        ),
+        "http://example.com/about": DummyResponse("About page"),
+    }
+
+    def response_side_effect(url: str):
+        if url not in responses:
+            raise AssertionError(f"Unexpected request for URL: {url}")
+        return responses[url]
+
+    mock_requests_get.side_effect = response_side_effect
+
+    root_doc = MagicMock(spec=LCDocument)
+    root_doc.page_content = "Root content"
+    root_doc.metadata = {"source": "http://example.com"}
+
+    about_doc = MagicMock(spec=LCDocument)
+    about_doc.page_content = "About content"
+    about_doc.metadata = {"source": "http://example.com/about"}
+
+    loader_instances = {
+        "http://example.com": MagicMock(),
+        "http://example.com/about": MagicMock(),
+    }
+    loader_instances["http://example.com"].load.return_value = [root_doc]
+    loader_instances["http://example.com/about"].load.return_value = [about_doc]
+
+    loader_call_order = []
+
+    def loader_factory(url_list):  # records crawl order; one loader per URL
+        url = url_list[0]
+        loader_call_order.append(url)
+        return loader_instances[url]
+
+    crawler = CrawlerLoader(limit=5)
+    crawler.loader = MagicMock(side_effect=loader_factory)
+
+    result = crawler.load_data("http://example.com")
+
+    assert len(result) == 2
+    assert all(isinstance(doc, Document) for doc in result)
+
+    sources = {doc.extra_info.get("source") for doc in result}
+    assert sources == {"http://example.com", "http://example.com/about"}
+
+    texts = {doc.text for doc in result}
+    assert texts == {"Root content", "About content"}
+
+    assert mock_requests_get.call_count == 2  # external.com was never fetched
+    assert loader_call_order == ["http://example.com", "http://example.com/about"]
+
+
+@patch("application.parser.remote.crawler_loader.requests.get")
+def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get):
+    # Given a list, only the first URL is crawled, and "http://" is prepended.
+    mock_requests_get.return_value = DummyResponse("No links here")
+
+    doc = MagicMock(spec=LCDocument)
+    doc.page_content = "Homepage"
+    doc.metadata = {"source": "http://example.com"}
+
+    loader_instance = MagicMock()
+    loader_instance.load.return_value = [doc]
+
+    crawler = CrawlerLoader()
+    crawler.loader = MagicMock(return_value=loader_instance)
+
+    result = crawler.load_data(["example.com", "unused.com"])
+
+    mock_requests_get.assert_called_once_with("http://example.com")
+    crawler.loader.assert_called_once_with(["http://example.com"])
+
+    assert len(result) == 1
+    assert result[0].text == "Homepage"
+    assert result[0].extra_info == {"source": "http://example.com"}
+
+
+@patch("application.parser.remote.crawler_loader.requests.get")
+def test_load_data_respects_limit(mock_requests_get):
+    # With limit=1 the crawler must stop after the root page even though
+    # the root links to a same-domain /about page.
+    responses = {
+        "http://example.com": DummyResponse(
+            """
+            <html><body>
+            <a href="http://example.com/about">About</a>
+            </body></html>
+            """
+        ),
+        "http://example.com/about": DummyResponse("About"),
+    }
+
+    mock_requests_get.side_effect = lambda url: responses[url]
+
+    root_doc = MagicMock(spec=LCDocument)
+    root_doc.page_content = "Root content"
+    root_doc.metadata = {"source": "http://example.com"}
+
+    about_doc = MagicMock(spec=LCDocument)
+    about_doc.page_content = "About content"
+    about_doc.metadata = {"source": "http://example.com/about"}
+
+    loader_instances = {
+        "http://example.com": MagicMock(),
+        "http://example.com/about": MagicMock(),
+    }
+    loader_instances["http://example.com"].load.return_value = [root_doc]
+    loader_instances["http://example.com/about"].load.return_value = [about_doc]
+
+    crawler = CrawlerLoader(limit=1)
+    crawler.loader = MagicMock(side_effect=lambda url_list: loader_instances[url_list[0]])
+
+    result = crawler.load_data("http://example.com")
+
+    assert len(result) == 1
+    assert result[0].text == "Root content"
+    assert mock_requests_get.call_count == 1  # /about was never fetched
+    assert crawler.loader.call_count == 1
+
+
+@patch("application.parser.remote.crawler_loader.logging")
+@patch("application.parser.remote.crawler_loader.requests.get")
+def test_load_data_logs_and_skips_on_loader_error(mock_requests_get, mock_logging):
+    # A loader failure is logged with exc_info and skipped, not raised.
+    mock_requests_get.return_value = DummyResponse("Error route")
+
+    failing_loader_instance = MagicMock()
+    failing_loader_instance.load.side_effect = Exception("load failure")
+
+    crawler = CrawlerLoader()
+    crawler.loader = MagicMock(return_value=failing_loader_instance)
+
+    result = crawler.load_data("http://example.com")
+
+    assert result == []
+    mock_requests_get.assert_called_once_with("http://example.com")
+    failing_loader_instance.load.assert_called_once()
+
+    mock_logging.error.assert_called_once()
+    (message,) = mock_logging.error.call_args.args
+    assert "Error processing URL http://example.com" in message
+    assert mock_logging.error.call_args.kwargs.get("exc_info") is True
+
diff --git a/tests/parser/remote/test_crawler_markdown.py b/tests/parser/remote/test_crawler_markdown.py
new file mode 100644
index 00000000..ac27b3d0
--- /dev/null
+++ b/tests/parser/remote/test_crawler_markdown.py
@@ -0,0 +1,139 @@
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+import requests
+
+from application.parser.remote.crawler_markdown import CrawlerLoader
+from application.parser.schema.base import Document
+
+
+class DummyResponse:  # minimal stand-in for requests.Response used by the crawler session
+    def __init__(self, text):
+        self.text = text
+
+    def raise_for_status(self):  # behaves like a 200 OK response
+        return None
+
+
+def _fake_extract(value: str) -> SimpleNamespace:
+    # Crude tldextract replacement: treat the last two host labels as
+    # domain + suffix so the tests need no network/suffix-list access.
+    value = value.split("//")[-1]
+    host = value.split("/")[0]
+    parts = host.split(".")
+    if len(parts) >= 2:
+        domain = parts[-2]
+        suffix = parts[-1]
+    else:
+        domain = host
+        suffix = ""
+    return SimpleNamespace(domain=domain, suffix=suffix)
+
+
+@pytest.fixture(autouse=True)
+def _patch_tldextract(monkeypatch):  # every test gets the offline tldextract stub
+    monkeypatch.setattr(
+        "application.parser.remote.crawler_markdown.tldextract.extract",
+        _fake_extract,
+    )
+
+
+@pytest.fixture(autouse=True)
+def _patch_markdownify(monkeypatch):
+    # Maps exact HTML strings to canned markdown; unknown HTML passes through.
+    outputs = {}
+
+    def fake_markdownify(html, *_, **__):
+        return outputs.get(html, html)
+
+    monkeypatch.setattr(
+        "application.parser.remote.crawler_markdown.markdownify",
+        fake_markdownify,
+    )
+    return outputs
+
+
+def _setup_session(mock_get_side_effect):  # fake session whose .get is fully scripted
+    session = MagicMock()
+    session.get.side_effect = mock_get_side_effect
+    return session
+
+
+def test_load_data_filters_external_links(_patch_markdownify):
+    # Root links to a same-domain page and an off-domain page; only the
+    # same-domain link is crawled (other.com never appears in responses).
+    root_html = """
+    <html><head><title>Home</title></head>
+    <body><a href="http://example.com/about">About</a><a href="http://other.com">Other</a>Welcome</body>
+    </html>
+    """
+    about_html = "<html><head><title>About</title></head><body>About page</body></html>"
+
+    _patch_markdownify[root_html] = "Home Markdown"
+    _patch_markdownify[about_html] = "About Markdown"
+
+    responses = {
+        "http://example.com": DummyResponse(root_html),
+        "http://example.com/about": DummyResponse(about_html),
+    }
+
+    loader = CrawlerLoader(limit=5)
+    loader.session = _setup_session(lambda url, timeout=10: responses[url])
+
+    docs = loader.load_data("http://example.com")
+
+    assert len(docs) == 2
+    for doc in docs:
+        assert isinstance(doc, Document)
+        assert doc.extra_info["source"] in responses
+    texts = {doc.text for doc in docs}
+    assert texts == {"Home Markdown", "About Markdown"}
+
+
+def test_load_data_allows_subdomains(_patch_markdownify):
+    # With allow_subdomains=True, blog.example.com counts as the same site.
+    root_html = """
+    <html><head><title>Home</title></head>
+    <body><a href="http://blog.example.com/post">Blog</a></body>
+    </html>
+    """
+    blog_html = "<html><head><title>Blog</title></head><body>Blog post</body></html>"
+
+    _patch_markdownify[root_html] = "Home Markdown"
+    _patch_markdownify[blog_html] = "Blog Markdown"
+
+    responses = {
+        "http://example.com": DummyResponse(root_html),
+        "http://blog.example.com/post": DummyResponse(blog_html),
+    }
+
+    loader = CrawlerLoader(limit=5, allow_subdomains=True)
+    loader.session = _setup_session(lambda url, timeout=10: responses[url])
+
+    docs = loader.load_data("http://example.com")
+
+    sources = {doc.extra_info["source"] for doc in docs}
+    assert "http://blog.example.com/post" in sources
+    assert len(docs) == 2
+
+
+def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
+    # A child page that fails to fetch is reported (via print) but does not
+    # abort the crawl; the root document is still returned.
+    root_html = """
+    <html><head><title>Home</title></head>
+    <body><a href="http://example.com/about">About</a></body>
+    </html>
+    """
+
+    _patch_markdownify[root_html] = "Home Markdown"
+
+    def side_effect(url, timeout=10):
+        if url == "http://example.com":
+            return DummyResponse(root_html)
+        raise requests.exceptions.RequestException("boom")
+
+    loader = CrawlerLoader(limit=5)
+    loader.session = _setup_session(side_effect)
+    mock_print = MagicMock()
+    monkeypatch.setattr("builtins.print", mock_print)
+
+    docs = loader.load_data("http://example.com")
+
+    assert len(docs) == 1
+    assert docs[0].text == "Home Markdown"
+    assert mock_print.called  # the fetch error was reported
+
diff --git a/tests/parser/remote/test_web_loader.py b/tests/parser/remote/test_web_loader.py
new file mode 100644
index 00000000..ca539f0a
--- /dev/null
+++ b/tests/parser/remote/test_web_loader.py
@@ -0,0 +1,303 @@
+import pytest
+from unittest.mock import patch, MagicMock
+from urllib.parse import urlparse
+
+from application.parser.remote.web_loader import WebLoader, headers
+from application.parser.schema.base import Document
+from langchain.docstore.document import Document as LCDocument
+
+
+@pytest.fixture
+def web_loader():  # fresh WebLoader instance per test
+    return WebLoader()
+
+
+@pytest.fixture
+def mock_langchain_document():
+    """Create a mock LangChain document."""
+    doc = MagicMock(spec=LCDocument)
+    doc.page_content = "Test web page content"
+    doc.metadata = {"source": "https://example.com", "title": "Test Page"}
+    return doc
+
+
+@pytest.fixture
+def mock_web_base_loader():
+    """Create a mock WebBaseLoader class."""
+    mock_loader_class = MagicMock()  # NOTE(review): fixture appears unused by these tests
+    mock_loader_instance = MagicMock()
+    mock_loader_class.return_value = mock_loader_instance
+    return mock_loader_class, mock_loader_instance
+
+
+class TestWebLoaderInitialization:
+    """Test WebLoader initialization."""
+
+    def test_init(self, web_loader):
+        """Test WebLoader initialization."""
+        assert web_loader.loader is not None
+        from langchain_community.document_loaders import WebBaseLoader
+        assert web_loader.loader == WebBaseLoader
+
+
+class TestWebLoaderHeaders:
+    """Test WebLoader headers configuration."""
+
+    def test_headers_defined(self):
+        """Test that headers are properly defined."""
+        assert isinstance(headers, dict)
+        assert "User-Agent" in headers
+        assert "Accept" in headers
+        assert "Accept-Language" in headers
+        assert "Referer" in headers
+        assert "DNT" in headers
+        assert "Connection" in headers
+        assert "Upgrade-Insecure-Requests" in headers
+
+    def test_headers_values(self):
+        """Test header values are reasonable."""
+        assert headers["User-Agent"] == "Mozilla/5.0"
+        assert "text/html" in headers["Accept"]
+        assert headers["Referer"] == "https://www.google.com/"
+        assert headers["DNT"] == "1"
+        assert headers["Connection"] == "keep-alive"
+
+
+class TestWebLoaderLoadData:
+    """Test WebLoader load_data method."""
+
+    def test_load_data_single_url_string(self, web_loader, mock_langchain_document):
+        """Test loading data from a single URL passed as string."""
+
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = [mock_langchain_document]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("https://example.com")
+
+        assert len(result) == 1
+        assert isinstance(result[0], Document)
+        assert result[0].text == "Test web page content"
+        assert result[0].extra_info == {"source": "https://example.com", "title": "Test Page"}
+
+        mock_web_base_loader_class.assert_called_once_with(["https://example.com"], header_template=headers)
+        mock_loader_instance.load.assert_called_once()
+
+    def test_load_data_multiple_urls_list(self, web_loader):
+        """Test loading data from multiple URLs passed as list."""
+
+        doc1 = MagicMock(spec=LCDocument)
+        doc1.page_content = "Content from site 1"
+        doc1.metadata = {"source": "https://site1.com"}
+
+        doc2 = MagicMock(spec=LCDocument)
+        doc2.page_content = "Content from site 2"
+        doc2.metadata = {"source": "https://site2.com"}
+
+        # One loader instance per URL, consumed in order via side_effect.
+        mock_loader_instance1 = MagicMock()
+        mock_loader_instance1.load.return_value = [doc1]
+
+        mock_loader_instance2 = MagicMock()
+        mock_loader_instance2.load.return_value = [doc2]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.side_effect = [mock_loader_instance1, mock_loader_instance2]
+
+        web_loader.loader = mock_web_base_loader_class
+
+        urls = ["https://site1.com", "https://site2.com"]
+        result = web_loader.load_data(urls)
+
+        assert len(result) == 2
+        assert all(isinstance(doc, Document) for doc in result)
+        assert result[0].text == "Content from site 1"
+        assert result[1].text == "Content from site 2"
+        assert result[0].extra_info == {"source": "https://site1.com"}
+        assert result[1].extra_info == {"source": "https://site2.com"}
+
+        assert mock_web_base_loader_class.call_count == 2
+        mock_web_base_loader_class.assert_any_call(["https://site1.com"], header_template=headers)
+        mock_web_base_loader_class.assert_any_call(["https://site2.com"], header_template=headers)
+
+    def test_load_data_url_without_scheme(self, web_loader, mock_langchain_document):
+        """Test loading data from URL without scheme (should add http://)."""
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = [mock_langchain_document]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("example.com")
+
+        assert len(result) == 1
+        assert isinstance(result[0], Document)
+
+        # Verify WebBaseLoader was called with http:// prefix
+        mock_web_base_loader_class.assert_called_once_with(["http://example.com"], header_template=headers)
+
+    def test_load_data_url_with_scheme(self, web_loader, mock_langchain_document):
+        """Test loading data from URL with scheme (should not modify)."""
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = [mock_langchain_document]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("https://example.com")
+
+        assert len(result) == 1
+
+        # Verify WebBaseLoader was called with original URL
+        mock_web_base_loader_class.assert_called_once_with(["https://example.com"], header_template=headers)
+
+    def test_load_data_multiple_documents_per_url(self, web_loader):
+        """Test loading multiple documents from a single URL."""
+        doc1 = MagicMock(spec=LCDocument)
+        doc1.page_content = "First document content"
+        doc1.metadata = {"source": "https://example.com", "section": "intro"}
+
+        doc2 = MagicMock(spec=LCDocument)
+        doc2.page_content = "Second document content"
+        doc2.metadata = {"source": "https://example.com", "section": "main"}
+
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = [doc1, doc2]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("https://example.com")
+
+        assert len(result) == 2
+        assert result[0].text == "First document content"
+        assert result[1].text == "Second document content"
+        assert result[0].extra_info == {"source": "https://example.com", "section": "intro"}
+        assert result[1].extra_info == {"source": "https://example.com", "section": "main"}
+
+
+class TestWebLoaderErrorHandling:
+    """Test WebLoader error handling."""
+
+    @patch('application.parser.remote.web_loader.logging')
+    def test_load_data_single_url_error(self, mock_logging, web_loader):
+        """Test error handling for single URL that fails to load."""
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.side_effect = Exception("Network error")
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("https://invalid-url.com")
+
+        assert result == []  # Should return empty list on error
+        mock_logging.error.assert_called_once()
+        error_call = mock_logging.error.call_args
+        assert "Error processing URL https://invalid-url.com" in error_call[0][0]
+        assert error_call[1]["exc_info"] is True
+
+    @patch('application.parser.remote.web_loader.logging')
+    def test_load_data_partial_failure(self, mock_logging, web_loader):
+        """Test partial failure - some URLs succeed, some fail."""
+        doc1 = MagicMock(spec=LCDocument)
+        doc1.page_content = "Success content"
+        doc1.metadata = {"source": "https://good-url.com"}
+
+        mock_loader_instance1 = MagicMock()
+        mock_loader_instance1.load.return_value = [doc1]
+
+        mock_loader_instance2 = MagicMock()
+        mock_loader_instance2.load.side_effect = Exception("Network error")
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.side_effect = [mock_loader_instance1, mock_loader_instance2]
+
+        web_loader.loader = mock_web_base_loader_class
+
+        urls = ["https://good-url.com", "https://bad-url.com"]
+        result = web_loader.load_data(urls)
+
+        assert len(result) == 1  # Only successful URL should be in results
+        assert result[0].text == "Success content"
+        assert result[0].extra_info == {"source": "https://good-url.com"}
+
+        mock_logging.error.assert_called_once()
+        error_call = mock_logging.error.call_args
+        assert "Error processing URL https://bad-url.com" in error_call[0][0]
+
+
+class TestWebLoaderEdgeCases:
+    """Test WebLoader edge cases."""
+
+    def test_load_data_empty_list(self, web_loader):
+        """Test loading data with empty URL list."""
+        result = web_loader.load_data([])
+        assert result == []
+
+    def test_load_data_empty_response(self, web_loader):
+        """Test loading data when WebBaseLoader returns empty list."""
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = []
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_data("https://empty-page.com")
+
+        assert result == []
+
+    def test_url_scheme_detection(self):
+        """Test URL scheme detection logic."""
+        # Test URLs with schemes
+        assert urlparse("https://example.com").scheme == "https"
+        assert urlparse("http://example.com").scheme == "http"
+        assert urlparse("ftp://example.com").scheme == "ftp"
+
+        # Test URLs without schemes
+        assert urlparse("example.com").scheme == ""
+        assert urlparse("www.example.com").scheme == ""
+
+
+class TestWebLoaderIntegration:
+    """Test WebLoader integration with base class."""
+
+    def test_inherits_from_base_remote(self, web_loader):
+        """Test that WebLoader inherits from BaseRemote."""
+        from application.parser.remote.base import BaseRemote
+        assert isinstance(web_loader, BaseRemote)
+
+    def test_implements_load_data_method(self, web_loader):
+        """Test that WebLoader implements required load_data method."""
+        assert hasattr(web_loader, 'load_data')
+        assert callable(web_loader.load_data)
+
+    def test_load_langchain_documents_method(self, web_loader, mock_langchain_document):
+        """Test inherited load_langchain_documents method."""
+        mock_loader_instance = MagicMock()
+        mock_loader_instance.load.return_value = [mock_langchain_document]
+
+        mock_web_base_loader_class = MagicMock()
+        mock_web_base_loader_class.return_value = mock_loader_instance
+
+        web_loader.loader = mock_web_base_loader_class
+
+        result = web_loader.load_langchain_documents(inputs="https://example.com")
+
+        assert len(result) == 1
+        assert isinstance(result[0], LCDocument)
+        assert result[0].page_content == "Test web page content"
+        assert result[0].metadata == {"source": "https://example.com", "title": "Test Page"}