mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-05-07 06:30:03 +00:00
315 lines
12 KiB
Python
import base64
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
import requests
|
|
|
|
from application.parser.remote.github_loader import GitHubLoader
|
|
|
|
|
|
def make_response(json_data=None, status_code=200, raise_error=None):
    """Build a ``MagicMock`` that stands in for an HTTP response object.

    Args:
        json_data: Value returned by the mock's ``json()`` method.
        status_code: Value exposed as the mock's ``status_code`` attribute.
        raise_error: Optional exception; when given, calling
            ``raise_for_status()`` on the mock raises it. When ``None``,
            ``raise_for_status()`` returns ``None``.

    Returns:
        The configured ``MagicMock``.
    """
    resp = MagicMock()
    resp.status_code = status_code
    resp.json.return_value = json_data
    if raise_error is not None:
        resp.raise_for_status.side_effect = raise_error
    else:
        # Explicit None so callers never receive a truthy child mock here.
        resp.raise_for_status.return_value = None
    return resp
|
|
|
|
|
|
class TestGitHubLoaderFetchFileContent:
    """Tests for ``GitHubLoader.fetch_file_content`` with ``requests.get`` mocked."""

    @patch("application.parser.remote.github_loader.requests.get")
    def test_text_file_base64_decoded(self, mock_get):
        """Base64 content of a text file is returned decoded, via one GET call."""
        loader = GitHubLoader()
        content_str = "Hello from README"
        b64 = base64.b64encode(content_str.encode("utf-8")).decode("utf-8")
        mock_get.return_value = make_response({"encoding": "base64", "content": b64})

        result = loader.fetch_file_content("owner/repo", "README.md")

        assert result == content_str
        mock_get.assert_called_once_with(
            "https://api.github.com/repos/owner/repo/contents/README.md",
            headers=loader.headers,
            timeout=100,
        )

    @patch("application.parser.remote.github_loader.requests.get")
    def test_binary_file_skipped(self, mock_get):
        """A ``.png`` path yields ``None`` even though content is present."""
        loader = GitHubLoader()
        mock_get.return_value = make_response({"encoding": "base64", "content": "AAAA"})

        result = loader.fetch_file_content("owner/repo", "image.png")

        assert result is None

    @patch("application.parser.remote.github_loader.requests.get")
    def test_non_base64_plain_content(self, mock_get):
        """Content with an empty ``encoding`` is returned unchanged."""
        loader = GitHubLoader()
        mock_get.return_value = make_response({"encoding": "", "content": "Plain text"})

        result = loader.fetch_file_content("owner/repo", "file.txt")

        assert result == "Plain text"

    @patch("application.parser.remote.github_loader.requests.get")
    def test_http_error_raises(self, mock_get):
        """An ``HTTPError`` from ``raise_for_status`` propagates to the caller."""
        loader = GitHubLoader()
        http_err = requests.HTTPError("Not found")
        mock_get.return_value = make_response(status_code=404, raise_error=http_err)

        with pytest.raises(requests.HTTPError):
            loader.fetch_file_content("owner/repo", "missing.txt")
|
|
|
|
|
|
class TestGitHubLoaderFetchRepoFiles:
    """Tests for ``GitHubLoader.fetch_repo_files`` directory traversal."""

    @patch("application.parser.remote.github_loader.requests.get")
    def test_recurses_directories(self, mock_get):
        """Files nested under a ``dir`` entry are listed alongside root files."""
        loader = GitHubLoader()

        def side_effect(url, headers=None, timeout=None):
            # Serve a two-level fake tree keyed on the requested contents URL.
            if url.endswith("/contents/"):
                return make_response([
                    {"type": "file", "path": "README.md"},
                    {"type": "dir", "path": "src"},
                ])
            elif url.endswith("/contents/src"):
                return make_response([
                    {"type": "file", "path": "src/main.py"},
                    {"type": "file", "path": "src/util.py"},
                ])
            # Any other URL means the loader requested something unplanned.
            raise AssertionError(f"Unexpected URL: {url}")

        mock_get.side_effect = side_effect

        files = loader.fetch_repo_files("owner/repo", path="")

        # Compare as a set: traversal order is not what is under test.
        assert set(files) == {"README.md", "src/main.py", "src/util.py"}
|
|
|
|
|
|
class TestGitHubLoaderLoadData:
    """Tests for ``GitHubLoader.load_data`` document construction."""

    def test_load_data_builds_documents_from_files(self, monkeypatch):
        """Each listed file becomes a document with text, title and source URL."""
        loader = GitHubLoader()

        # Stub out network-dependent methods
        monkeypatch.setattr(loader, "fetch_repo_files", lambda repo, path="": [
            "README.md", "src/main.py"
        ])

        def fake_fetch_content(repo, file_path):
            return f"content for {file_path}"

        monkeypatch.setattr(loader, "fetch_file_content", fake_fetch_content)

        docs = loader.load_data("https://github.com/owner/repo")

        assert len(docs) == 2
        assert docs[0].text == "content for README.md"
        assert docs[0].extra_info == {
            "title": "README.md",
            "source": "https://github.com/owner/repo/blob/main/README.md",
        }
        assert docs[1].text == "content for src/main.py"
        assert docs[1].extra_info == {
            "title": "src/main.py",
            "source": "https://github.com/owner/repo/blob/main/src/main.py",
        }
|
|
|
|
|
|
|
|
|
|
class TestGitHubLoaderIsTextFile:
    """Tests for ``GitHubLoader.is_text_file`` classification by file name."""

    def test_known_extension(self):
        """``.py`` and ``.json`` names are classified as text."""
        loader = GitHubLoader()
        assert loader.is_text_file("app.py") is True
        assert loader.is_text_file("data.json") is True

    def test_unknown_extension_with_text_mime(self):
        """A ``.xml`` name is classified as text."""
        loader = GitHubLoader()
        assert loader.is_text_file("file.xml") is True

    def test_binary_file(self):
        """A ``.png`` name is classified as non-text."""
        loader = GitHubLoader()
        assert loader.is_text_file("image.png") is False

    @patch("application.parser.remote.github_loader.mimetypes.guess_type")
    def test_mime_fallback_text(self, mock_mime):
        """An unrecognised extension is text when the guessed MIME type is text."""
        mock_mime.return_value = ("text/plain", None)
        loader = GitHubLoader()
        assert loader.is_text_file("unknownfile.xyz") is True
|
|
|
|
|
|
class TestGitHubLoaderMakeRequest:
    """Tests for ``GitHubLoader._make_request`` status handling and retries."""

    @staticmethod
    def _forbidden_response(message, remaining, reset):
        """Return a mocked 403 response with the given message and rate-limit headers."""
        resp = MagicMock()
        resp.status_code = 403
        resp.json.return_value = {"message": message}
        resp.headers = {
            "X-RateLimit-Remaining": remaining,
            "X-RateLimit-Reset": reset,
        }
        return resp

    @patch("application.parser.remote.github_loader.requests.get")
    def test_success(self, mock_get):
        """A 200 response is handed back to the caller."""
        loader = GitHubLoader()
        mock_get.return_value = make_response({"ok": True}, 200)
        resp = loader._make_request("http://example.com")
        assert resp.status_code == 200

    # Decorators apply bottom-up, so the ``requests.get`` mock is the first argument.
    @patch("application.parser.remote.github_loader.time.sleep")
    @patch("application.parser.remote.github_loader.requests.get")
    def test_rate_limit_retry(self, mock_get, mock_sleep):
        """A rate-limited 403 followed by a 200 succeeds after exactly one sleep."""
        loader = GitHubLoader()
        rate_resp = self._forbidden_response("API rate limit exceeded", "0", "9999999")
        ok_resp = make_response({"ok": True}, 200)
        mock_get.side_effect = [rate_resp, ok_resp]

        resp = loader._make_request("http://example.com", max_retries=2)

        assert resp.status_code == 200
        # Both queued responses were consumed: one failure, one retry.
        assert mock_get.call_count == 2
        mock_sleep.assert_called_once()

    @patch("application.parser.remote.github_loader.requests.get")
    def test_rate_limit_exhausted(self, mock_get):
        """A rate-limited 403 with no retries left raises a rate-limit error."""
        loader = GitHubLoader()
        mock_get.return_value = self._forbidden_response(
            "API rate limit exceeded", "0", "9999"
        )

        with pytest.raises(Exception, match="rate limit exceeded"):
            loader._make_request("http://example.com", max_retries=1)

    @patch("application.parser.remote.github_loader.requests.get")
    def test_403_non_rate_limit(self, mock_get):
        """A 403 with remaining quota raises a generic GitHub API error."""
        loader = GitHubLoader()
        mock_get.return_value = self._forbidden_response(
            "Forbidden - need auth", "50", "9999"
        )

        with pytest.raises(Exception, match="GitHub API error"):
            loader._make_request("http://example.com", max_retries=1)

    @patch("application.parser.remote.github_loader.requests.get")
    def test_other_error_raises(self, mock_get):
        """A 500 response surfaces the ``HTTPError`` from ``raise_for_status``."""
        loader = GitHubLoader()
        resp = make_response(
            status_code=500,
            raise_error=requests.HTTPError("Server Error"),
        )
        mock_get.return_value = resp

        with pytest.raises(requests.HTTPError):
            loader._make_request("http://example.com", max_retries=1)
|
|
|
|
|
|
class TestGitHubLoaderFetchRepoFilesErrors:
    """Tests for ``GitHubLoader.fetch_repo_files`` on malformed API payloads."""

    @patch("application.parser.remote.github_loader.requests.get")
    def test_api_error_message_in_dict(self, mock_get):
        """A dict payload carrying ``message`` raises a GitHub API error."""
        loader = GitHubLoader()
        mock_get.return_value = make_response(
            {"message": "Not Found"}, 200
        )

        with pytest.raises(Exception, match="GitHub API error"):
            loader.fetch_repo_files("owner/repo")

    @patch("application.parser.remote.github_loader.requests.get")
    def test_non_list_response(self, mock_get):
        """A payload that is not a list raises ``TypeError``."""
        loader = GitHubLoader()
        mock_get.return_value = make_response("not a list", 200)

        with pytest.raises(TypeError, match="Expected list"):
            loader.fetch_repo_files("owner/repo")
|
|
|
|
|
|
class TestGitHubLoaderFetchFileContentEdgeCases:
    """Edge cases where ``GitHubLoader.fetch_file_content`` should return ``None``."""

    @patch("application.parser.remote.github_loader.requests.get")
    def test_empty_base64_text_returns_none(self, mock_get):
        """Base64 content that decodes to an empty string yields ``None``."""
        loader = GitHubLoader()
        b64 = base64.b64encode(b"").decode("utf-8")
        mock_get.return_value = make_response(
            {"encoding": "base64", "content": b64}
        )
        result = loader.fetch_file_content("owner/repo", "empty.py")
        assert result is None

    @patch("application.parser.remote.github_loader.requests.get")
    def test_empty_non_base64_returns_none(self, mock_get):
        """Whitespace-only content with a non-base64 encoding yields ``None``."""
        loader = GitHubLoader()
        mock_get.return_value = make_response(
            {"encoding": "none", "content": "   "}
        )
        result = loader.fetch_file_content("owner/repo", "empty.txt")
        assert result is None

    @patch("application.parser.remote.github_loader.requests.get")
    def test_decode_failure_returns_none(self, mock_get):
        """Content that is not valid base64 yields ``None`` instead of raising."""
        loader = GitHubLoader()
        mock_get.return_value = make_response(
            {"encoding": "base64", "content": "invalid!!base64"}
        )
        result = loader.fetch_file_content("owner/repo", "broken.py")
        assert result is None
|
|
|
|
|
|
class TestGitHubLoaderLoadDataSkipsNone:
    """Tests that ``GitHubLoader.load_data`` omits files with no content."""

    def test_skips_binary_files(self, monkeypatch):
        """A file whose fetched content is ``None`` produces no document."""
        loader = GitHubLoader()
        monkeypatch.setattr(
            loader, "fetch_repo_files", lambda repo, path="": ["a.py", "b.png"]
        )

        def fake_content(repo, fp):
            # Only a.py has content; b.png stands in for a skipped file.
            return "code" if fp == "a.py" else None

        monkeypatch.setattr(loader, "fetch_file_content", fake_content)
        docs = loader.load_data("https://github.com/o/r")
        assert len(docs) == 1
        assert docs[0].doc_id == "a.py"
|
|
|
|
|
|
class TestGitHubLoaderRobustness:
    """Tests for ``GitHubLoader`` behaviour on unparseable or oddly shaped responses."""

    @patch("application.parser.remote.github_loader.requests.get")
    def test_fetch_repo_files_non_json_raises(self, mock_get):
        """A response whose ``json()`` raises ``ValueError`` propagates that error."""
        resp = MagicMock()
        # Pin the status so the response counts as a success and the test
        # exercises JSON parsing, not whatever the loader does with a status
        # that compares unequal to every integer.
        resp.status_code = 200
        resp.json.side_effect = ValueError("No JSON")
        mock_get.return_value = resp
        with pytest.raises(ValueError):
            GitHubLoader().fetch_repo_files("owner/repo")

    @patch("application.parser.remote.github_loader.requests.get")
    def test_fetch_repo_files_unexpected_shape_missing_type_raises(self, mock_get):
        """A listing item without a ``type`` key raises ``KeyError``."""
        # Missing 'type' in items should raise KeyError when accessed
        mock_get.return_value = make_response([{"path": "README.md"}])
        with pytest.raises(KeyError):
            GitHubLoader().fetch_repo_files("owner/repo")

    @patch("application.parser.remote.github_loader.requests.get")
    def test_fetch_file_content_non_json_raises(self, mock_get):
        """A file response whose ``json()`` raises ``ValueError`` propagates it."""
        resp = MagicMock()
        resp.status_code = 200
        resp.json.side_effect = ValueError("No JSON")
        mock_get.return_value = resp
        with pytest.raises(ValueError):
            GitHubLoader().fetch_file_content("owner/repo", "README.md")

    @patch("application.parser.remote.github_loader.requests.get")
    def test_fetch_file_content_unexpected_shape_missing_content_returns_none(self, mock_get):
        """A base64 payload lacking ``content`` yields ``None`` instead of raising."""
        # encoding indicates base64 text, but 'content' key is missing.
        # The loader is expected to return None (the file is skipped) rather
        # than let the resulting error escape.
        resp = make_response({"encoding": "base64"})
        mock_get.return_value = resp
        result = GitHubLoader().fetch_file_content("owner/repo", "file.txt")
        assert result is None

    # Decorators apply bottom-up, so the ``requests.get`` mock is the first argument.
    @patch("application.parser.remote.github_loader.base64.b64decode")
    @patch("application.parser.remote.github_loader.requests.get")
    def test_large_binary_skip_does_not_decode(self, mock_get, mock_b64decode):
        """A ``.bin`` file is skipped without ``b64decode`` ever being called."""
        # Ensure we don't attempt to decode large binary content for non-text files
        mock_b64decode.side_effect = AssertionError("b64decode should not be called for binary files")
        mock_get.return_value = make_response({"encoding": "base64", "content": "AAA"})
        result = GitHubLoader().fetch_file_content("owner/repo", "bigfile.bin")
        assert result is None
|