From e24a0ac68655bf90100bb7022c01b08c7b1fe930 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Mon, 29 Sep 2025 20:33:05 +0530
Subject: [PATCH] (test:parsers) github, reddit

---
Review notes. Everything from the "---" line above to the first "diff --git"
line is commentary: git am ends the commit message at "---" and git apply
skips ahead to the first diff header, so nothing here reaches the commit.

Patch shape
  * Both files are new (/dev/null pre-image), so each diff is a single hunk
    whose payload lines all start with "+"; a bare "+" is an added blank line.
  * The hunk headers (+1,159 and +1,83), the diffstat and the abbreviated
    blob ids on the "index" lines describe the payload exactly as it stands.
    The payload is therefore left untouched here; any hand edit to the tests
    must update those counts as well.

tests/parser/remote/test_github_loader.py
  * make_response() returns a MagicMock with status_code set, json()
    returning json_data, and raise_for_status() either raising raise_error
    or returning None.
  * TestGitHubLoaderFetchFileContent patches requests.get as looked up
    through the github_loader module and checks four cases: base64 content
    is decoded and prefixed with "Filename: README.md" (and the request goes
    to the api.github.com contents URL with loader.headers); image.png
    yields the "is a binary file and was skipped." message; content with an
    empty encoding is passed through unchanged; an HTTPError raised by
    raise_for_status() propagates to the caller.
  * TestGitHubLoaderFetchRepoFiles serves one listing for ".../contents/"
    and one for ".../contents/src" and expects the three file paths back,
    compared as a set so ordering is not asserted. Any other URL fails the
    test through the AssertionError in side_effect.
  * TestGitHubLoaderLoadData replaces fetch_repo_files and
    fetch_file_content on the instance, so no HTTP mock is needed, and
    checks page_content plus the "title"/"source" metadata, where source is
    the repo URL followed by /blob/main/ and the file path.
  * TestGitHubLoaderRobustness expects a ValueError from json() to
    propagate, a KeyError when "type" or "content" is missing, and that
    base64.b64decode is never called for bigfile.bin.
  * NOTE(review): test_fetch_repo_files_non_json_raises uses a bare
    MagicMock, so resp.status_code is itself a mock rather than an int.
    Confirm fetch_repo_files does not branch on status_code before calling
    json(), otherwise the expected ValueError may never be reached.
  * NOTE(review): the target "...github_loader.base64.b64decode" presumably
    resolves to the stdlib base64 module imported by github_loader, in which
    case b64decode is replaced for every importer while that test runs.
    Confirm against the loader's imports.
  * NOTE(review): four blank lines precede TestGitHubLoaderRobustness where
    PEP 8 uses two. Cosmetic only.

tests/parser/remote/test_reddit_loader.py
  * load_data() takes a JSON string. Non-JSON input must raise ValueError
    mentioning "Invalid JSON input"; a payload with only client_id must
    raise ValueError mentioning "Missing required fields" and
    "client_secret".
  * With RedditPostsLoader patched in the reddit_loader module, the defaults
    test pins categories=["new", "hot"], mode="subreddit" and
    number_posts=10 for keys absent from the payload, and expects
    load_data() to return whatever the patched loader's load() returned.
  * The overrides test expects categories, mode and number_posts from the
    payload to be forwarded unchanged. It does not assert the return value.

 tests/parser/remote/test_github_loader.py | 159 ++++++++++++++++++++++
 tests/parser/remote/test_reddit_loader.py |  83 +++++++++++
 2 files changed, 242 insertions(+)
 create mode 100644 tests/parser/remote/test_github_loader.py
 create mode 100644 tests/parser/remote/test_reddit_loader.py

diff --git a/tests/parser/remote/test_github_loader.py b/tests/parser/remote/test_github_loader.py
new file mode 100644
index 00000000..6bb3ed2e
--- /dev/null
+++ b/tests/parser/remote/test_github_loader.py
@@ -0,0 +1,159 @@
+import base64
+import pytest
+from unittest.mock import patch, MagicMock
+import requests
+
+from application.parser.remote.github_loader import GitHubLoader
+
+
+def make_response(json_data=None, status_code=200, raise_error=None):
+    resp = MagicMock()
+    resp.status_code = status_code
+    resp.json.return_value = json_data
+    if raise_error is not None:
+        resp.raise_for_status.side_effect = raise_error
+    else:
+        resp.raise_for_status.return_value = None
+    return resp
+
+
+class TestGitHubLoaderFetchFileContent:
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_text_file_base64_decoded(self, mock_get):
+        loader = GitHubLoader()
+        content_str = "Hello from README"
+        b64 = base64.b64encode(content_str.encode("utf-8")).decode("utf-8")
+        mock_get.return_value = make_response({"encoding": "base64", "content": b64})
+
+        result = loader.fetch_file_content("owner/repo", "README.md")
+
+        assert result == f"Filename: README.md\n\n{content_str}"
+        mock_get.assert_called_once_with(
+            "https://api.github.com/repos/owner/repo/contents/README.md",
+            headers=loader.headers,
+        )
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_binary_file_skipped(self, mock_get):
+        loader = GitHubLoader()
+        mock_get.return_value = make_response({"encoding": "base64", "content": "AAAA"})
+
+        result = loader.fetch_file_content("owner/repo", "image.png")
+
+        assert result == "Filename: image.png is a binary file and was skipped."
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_non_base64_plain_content(self, mock_get):
+        loader = GitHubLoader()
+        mock_get.return_value = make_response({"encoding": "", "content": "Plain text"})
+
+        result = loader.fetch_file_content("owner/repo", "file.txt")
+
+        assert result == "Filename: file.txt\n\nPlain text"
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_http_error_raises(self, mock_get):
+        loader = GitHubLoader()
+        http_err = requests.HTTPError("Not found")
+        mock_get.return_value = make_response(status_code=404, raise_error=http_err)
+
+        with pytest.raises(requests.HTTPError):
+            loader.fetch_file_content("owner/repo", "missing.txt")
+
+
+class TestGitHubLoaderFetchRepoFiles:
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_recurses_directories(self, mock_get):
+        loader = GitHubLoader()
+
+        def side_effect(url, headers=None):
+            if url.endswith("/contents/"):
+                return make_response([
+                    {"type": "file", "path": "README.md"},
+                    {"type": "dir", "path": "src"},
+                ])
+            elif url.endswith("/contents/src"):
+                return make_response([
+                    {"type": "file", "path": "src/main.py"},
+                    {"type": "file", "path": "src/util.py"},
+                ])
+            raise AssertionError(f"Unexpected URL: {url}")
+
+        mock_get.side_effect = side_effect
+
+        files = loader.fetch_repo_files("owner/repo", path="")
+        assert set(files) == {"README.md", "src/main.py", "src/util.py"}
+
+
+class TestGitHubLoaderLoadData:
+    def test_load_data_builds_documents_from_files(self, monkeypatch):
+        loader = GitHubLoader()
+
+        # Stub out network-dependent methods
+        monkeypatch.setattr(loader, "fetch_repo_files", lambda repo, path="": [
+            "README.md", "src/main.py"
+        ])
+
+        def fake_fetch_content(repo, file_path):
+            return f"content for {file_path}"
+
+        monkeypatch.setattr(loader, "fetch_file_content", fake_fetch_content)
+
+        docs = loader.load_data("https://github.com/owner/repo")
+
+        assert len(docs) == 2
+        assert docs[0].page_content == "content for README.md"
+        assert docs[0].metadata == {
+            "title": "README.md",
+            "source": "https://github.com/owner/repo/blob/main/README.md",
+        }
+        assert docs[1].page_content == "content for src/main.py"
+        assert docs[1].metadata == {
+            "title": "src/main.py",
+            "source": "https://github.com/owner/repo/blob/main/src/main.py",
+        }
+
+
+
+
+class TestGitHubLoaderRobustness:
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_fetch_repo_files_non_json_raises(self, mock_get):
+        resp = MagicMock()
+        resp.json.side_effect = ValueError("No JSON")
+        mock_get.return_value = resp
+        with pytest.raises(ValueError):
+            GitHubLoader().fetch_repo_files("owner/repo")
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_fetch_repo_files_unexpected_shape_missing_type_raises(self, mock_get):
+        # Missing 'type' in items should raise KeyError when accessed
+        mock_get.return_value = make_response([{"path": "README.md"}])
+        with pytest.raises(KeyError):
+            GitHubLoader().fetch_repo_files("owner/repo")
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_fetch_file_content_non_json_raises(self, mock_get):
+        resp = MagicMock()
+        resp.status_code = 200
+        resp.json.side_effect = ValueError("No JSON")
+        mock_get.return_value = resp
+        with pytest.raises(ValueError):
+            GitHubLoader().fetch_file_content("owner/repo", "README.md")
+
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_fetch_file_content_unexpected_shape_missing_content_raises(self, mock_get):
+        # encoding indicates base64 text, but 'content' key is missing
+        resp = make_response({"encoding": "base64"})
+        mock_get.return_value = resp
+        with pytest.raises(KeyError):
+            GitHubLoader().fetch_file_content("owner/repo", "README.md")
+
+    @patch("application.parser.remote.github_loader.base64.b64decode")
+    @patch("application.parser.remote.github_loader.requests.get")
+    def test_large_binary_skip_does_not_decode(self, mock_get, mock_b64decode):
+        # Ensure we don't attempt to decode large binary content for non-text files
+        mock_b64decode.side_effect = AssertionError("b64decode should not be called for binary files")
+        mock_get.return_value = make_response({"encoding": "base64", "content": "AAA"})
+        result = GitHubLoader().fetch_file_content("owner/repo", "bigfile.bin")
+        assert result == "Filename: bigfile.bin is a binary file and was skipped."
diff --git a/tests/parser/remote/test_reddit_loader.py b/tests/parser/remote/test_reddit_loader.py
new file mode 100644
index 00000000..26199490
--- /dev/null
+++ b/tests/parser/remote/test_reddit_loader.py
@@ -0,0 +1,83 @@
+import json
+from unittest.mock import patch, MagicMock
+import pytest
+
+from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
+
+
+class TestRedditPostsLoaderRemote:
+    def test_invalid_json_raises(self):
+        loader = RedditPostsLoaderRemote()
+        with pytest.raises(ValueError) as exc:
+            loader.load_data("not a json")
+        assert "Invalid JSON input" in str(exc.value)
+
+    def test_missing_required_fields_raises(self):
+        loader = RedditPostsLoaderRemote()
+        payload = json.dumps({"client_id": "id"})
+        with pytest.raises(ValueError) as exc:
+            loader.load_data(payload)
+        assert "Missing required fields" in str(exc.value)
+        assert "client_secret" in str(exc.value)
+
+    @patch("application.parser.remote.reddit_loader.RedditPostsLoader")
+    def test_constructs_loader_and_loads_with_defaults(self, MockRedditLoader):
+        loader = RedditPostsLoaderRemote()
+
+        instance = MagicMock()
+        docs = [MagicMock(), MagicMock()]
+        instance.load.return_value = docs
+        MockRedditLoader.return_value = instance
+
+        payload = {
+            "client_id": "cid",
+            "client_secret": "csecret",
+            "user_agent": "ua",
+            "search_queries": ["r/langchain"],
+        }
+
+        result = loader.load_data(json.dumps(payload))
+
+        MockRedditLoader.assert_called_once_with(
+            client_id="cid",
+            client_secret="csecret",
+            user_agent="ua",
+            categories=["new", "hot"],
+            mode="subreddit",
+            search_queries=["r/langchain"],
+            number_posts=10,
+        )
+        instance.load.assert_called_once()
+        assert result == docs
+
+    @patch("application.parser.remote.reddit_loader.RedditPostsLoader")
+    def test_constructs_loader_and_loads_with_overrides(self, MockRedditLoader):
+        loader = RedditPostsLoaderRemote()
+
+        instance = MagicMock()
+        instance.load.return_value = []
+        MockRedditLoader.return_value = instance
+
+        payload = {
+            "client_id": "cid",
+            "client_secret": "csecret",
+            "user_agent": "ua",
+            "search_queries": ["python"],
+            "categories": ["hot"],
+            "mode": "comments",
+            "number_posts": 3,
+        }
+
+        loader.load_data(json.dumps(payload))
+
+        MockRedditLoader.assert_called_once_with(
+            client_id="cid",
+            client_secret="csecret",
+            user_agent="ua",
+            categories=["hot"],
+            mode="comments",
+            search_queries=["python"],
+            number_posts=3,
+        )
+        instance.load.assert_called_once()
+