mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 00:23:17 +00:00
Compare commits
1 Commits
dependabot
...
github-fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2db33b2d82 |
@@ -51,6 +51,9 @@ class Settings(BaseSettings):
|
||||
"http://127.0.0.1:7091/api/connectors/callback" ##add redirect url as it is to your provider's console(gcp)
|
||||
)
|
||||
|
||||
# GitHub source
|
||||
GITHUB_ACCESS_TOKEN: Optional[str] = None # PAT token with read repo access
|
||||
|
||||
# LLM Cache
|
||||
CACHE_REDIS_URL: str = "redis://localhost:6379/2"
|
||||
|
||||
|
||||
@@ -1,44 +1,135 @@
|
||||
import base64
|
||||
import requests
|
||||
from typing import List
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from application.parser.remote.base import BaseRemote
|
||||
from langchain_core.documents import Document
|
||||
from application.parser.schema.base import Document
|
||||
import mimetypes
|
||||
from application.core.settings import settings
|
||||
|
||||
class GitHubLoader(BaseRemote):
|
||||
def __init__(self):
|
||||
self.access_token = None
|
||||
self.access_token = settings.GITHUB_ACCESS_TOKEN
|
||||
self.headers = {
|
||||
"Authorization": f"token {self.access_token}"
|
||||
} if self.access_token else {}
|
||||
"Authorization": f"token {self.access_token}",
|
||||
"Accept": "application/vnd.github.v3+json"
|
||||
} if self.access_token else {
|
||||
"Accept": "application/vnd.github.v3+json"
|
||||
}
|
||||
return
|
||||
|
||||
def fetch_file_content(self, repo_url: str, file_path: str) -> str:
|
||||
def is_text_file(self, file_path: str) -> bool:
    """Return True when *file_path* looks like a text file.

    The decision is made from the name alone; file contents are never
    inspected. A path is treated as text when either:

    * its lower-cased form ends with one of the known text suffixes
      below, or
    * ``mimetypes.guess_type`` maps it to a ``text/*`` type,
      ``application/json`` or ``application/xml``.

    Args:
        file_path: Path of the file inside the repository.

    Returns:
        True if the file should be treated as text, False otherwise
        (including when no MIME type can be guessed).
    """
    # Kept as a tuple so it can be handed straight to ``str.endswith``,
    # which tests every suffix in a single call.
    text_extensions = (
        '.txt', '.md', '.markdown', '.rst', '.json', '.xml', '.yaml', '.yml',
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.c', '.cpp', '.h', '.hpp',
        '.cs', '.go', '.rs', '.rb', '.php', '.swift', '.kt', '.scala',
        '.html', '.css', '.scss', '.sass', '.less',
        '.sh', '.bash', '.zsh', '.fish',
        '.sql', '.r', '.m', '.mat',
        '.ini', '.cfg', '.conf', '.config', '.env',
        '.gitignore', '.dockerignore', '.editorconfig',
        '.log', '.csv', '.tsv',
    )

    # Suffix matching is case-insensitive; dotfiles such as ".gitignore"
    # match because the whole name is the suffix.
    if file_path.lower().endswith(text_extensions):
        return True

    # Fall back to the MIME type guessed from the (original-case) path.
    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type:
        return False
    return mime_type.startswith("text") or mime_type in ("application/json", "application/xml")
|
||||
|
||||
def fetch_file_content(self, repo_url: str, file_path: str) -> Optional[str]:
|
||||
"""Fetch file content. Returns None if file should be skipped (binary files or empty files)."""
|
||||
url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}"
|
||||
response = requests.get(url, headers=self.headers)
|
||||
response = self._make_request(url)
|
||||
|
||||
if response.status_code == 200:
|
||||
content = response.json()
|
||||
mime_type, _ = mimetypes.guess_type(file_path) # Guess the MIME type based on the file extension
|
||||
content = response.json()
|
||||
|
||||
if content.get("encoding") == "base64":
|
||||
if mime_type and mime_type.startswith("text"): # Handle only text files
|
||||
try:
|
||||
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
|
||||
return f"Filename: {file_path}\n\n{decoded_content}"
|
||||
except Exception as e:
|
||||
raise e
|
||||
else:
|
||||
return f"Filename: {file_path} is a binary file and was skipped."
|
||||
if content.get("encoding") == "base64":
|
||||
if self.is_text_file(file_path): # Handle only text files
|
||||
try:
|
||||
decoded_content = base64.b64decode(content["content"]).decode("utf-8").strip()
|
||||
# Skip empty files
|
||||
if not decoded_content:
|
||||
return None
|
||||
return decoded_content
|
||||
except Exception:
|
||||
# If decoding fails, it's probably a binary file
|
||||
return None
|
||||
else:
|
||||
return f"Filename: {file_path}\n\n{content['content']}"
|
||||
# Skip binary files by returning None
|
||||
return None
|
||||
else:
|
||||
response.raise_for_status()
|
||||
file_content = content['content'].strip()
|
||||
# Skip empty files
|
||||
if not file_content:
|
||||
return None
|
||||
return file_content
|
||||
|
||||
def _make_request(self, url: str, max_retries: int = 3) -> requests.Response:
    """GET *url* with ``self.headers``, retrying when GitHub rate-limits.

    A 200 response is returned immediately. A 403 whose JSON ``message``
    mentions "rate limit" is retried after an exponential backoff
    (1s, 2s, ...) until the attempts are used up. Any other 403 raises an
    ``Exception`` with a hint about ``GITHUB_ACCESS_TOKEN``. Every other
    status goes through ``response.raise_for_status()``.

    Args:
        url: Fully-formed GitHub API URL.
        max_retries: Maximum number of attempts. Values below 1 are
            treated as 1 so that a request is always made.

    Returns:
        The response object. It has status 200, or another status that
        ``raise_for_status()`` does not consider an error.

    Raises:
        Exception: On a 403 that is not retried (or whose retries are
            exhausted); the message starts with "GitHub API".
        requests.HTTPError: Via ``raise_for_status()`` for other error
            statuses, or for a 403 whose body cannot be interpreted.
    """
    # NOTE(review): requests.get is called without a timeout and can block
    # indefinitely. Left unchanged here because callers/tests may depend
    # on the exact call arguments - confirm before adding ``timeout=``.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        response = requests.get(url, headers=self.headers)

        if response.status_code == 200:
            return response

        if response.status_code != 403:
            # Raises for error statuses; anything else is handed back
            # as-is instead of being re-requested.
            response.raise_for_status()
            return response

        # Only the parsing of the error body is guarded, so the exceptions
        # raised further down never need to be told apart from parse errors.
        try:
            error_msg = response.json().get("message", "")
            rate_limited = "rate limit" in error_msg.lower()
        except Exception:
            # Body is not JSON or has an unexpected shape: surface the
            # plain HTTP error instead.
            response.raise_for_status()
            return response

        remaining = response.headers.get("X-RateLimit-Remaining", "unknown")
        reset_time = response.headers.get("X-RateLimit-Reset", "unknown")

        print(f"GitHub API 403 Error: {error_msg}")
        print(f"Rate limit remaining: {remaining}, Reset time: {reset_time}")

        if rate_limited and attempt < attempts - 1:
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Rate limit hit, waiting {wait_time} seconds before retry...")
            time.sleep(wait_time)
            continue

        # Out of retries, or a 403 that is not a rate limit.
        if remaining == "0":
            raise Exception(f"GitHub API rate limit exceeded. Please set GITHUB_ACCESS_TOKEN environment variable. Reset time: {reset_time}")
        raise Exception(f"GitHub API error: {error_msg}. This may require authentication - set GITHUB_ACCESS_TOKEN environment variable.")

    # Not reached: the final attempt always returns or raises above.
    return response
|
||||
|
||||
def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]:
|
||||
url = f"https://api.github.com/repos/{repo_url}/contents/{path}"
|
||||
response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"})
|
||||
response = self._make_request(url)
|
||||
|
||||
contents = response.json()
|
||||
|
||||
# Handle error responses from GitHub API
|
||||
if isinstance(contents, dict) and "message" in contents:
|
||||
raise Exception(f"GitHub API error: {contents.get('message')}")
|
||||
|
||||
# Ensure contents is a list
|
||||
if not isinstance(contents, list):
|
||||
raise TypeError(f"Expected list from GitHub API, got {type(contents).__name__}: {contents}")
|
||||
|
||||
files = []
|
||||
for item in contents:
|
||||
if item["type"] == "file":
|
||||
@@ -53,6 +144,15 @@ class GitHubLoader(BaseRemote):
|
||||
documents = []
|
||||
for file_path in files:
|
||||
content = self.fetch_file_content(repo_name, file_path)
|
||||
documents.append(Document(page_content=content, metadata={"title": file_path,
|
||||
"source": f"https://github.com/{repo_name}/blob/main/{file_path}"}))
|
||||
# Skip binary files (content is None)
|
||||
if content is None:
|
||||
continue
|
||||
documents.append(Document(
|
||||
text=content,
|
||||
doc_id=file_path,
|
||||
extra_info={
|
||||
"title": file_path,
|
||||
"source": f"https://github.com/{repo_name}/blob/main/{file_path}"
|
||||
}
|
||||
))
|
||||
return documents
|
||||
|
||||
@@ -27,7 +27,7 @@ class TestGitHubLoaderFetchFileContent:
|
||||
|
||||
result = loader.fetch_file_content("owner/repo", "README.md")
|
||||
|
||||
assert result == f"Filename: README.md\n\n{content_str}"
|
||||
assert result == content_str
|
||||
mock_get.assert_called_once_with(
|
||||
"https://api.github.com/repos/owner/repo/contents/README.md",
|
||||
headers=loader.headers,
|
||||
@@ -40,7 +40,7 @@ class TestGitHubLoaderFetchFileContent:
|
||||
|
||||
result = loader.fetch_file_content("owner/repo", "image.png")
|
||||
|
||||
assert result == "Filename: image.png is a binary file and was skipped."
|
||||
assert result is None
|
||||
|
||||
@patch("application.parser.remote.github_loader.requests.get")
|
||||
def test_non_base64_plain_content(self, mock_get):
|
||||
@@ -49,7 +49,7 @@ class TestGitHubLoaderFetchFileContent:
|
||||
|
||||
result = loader.fetch_file_content("owner/repo", "file.txt")
|
||||
|
||||
assert result == "Filename: file.txt\n\nPlain text"
|
||||
assert result == "Plain text"
|
||||
|
||||
@patch("application.parser.remote.github_loader.requests.get")
|
||||
def test_http_error_raises(self, mock_get):
|
||||
@@ -102,13 +102,13 @@ class TestGitHubLoaderLoadData:
|
||||
docs = loader.load_data("https://github.com/owner/repo")
|
||||
|
||||
assert len(docs) == 2
|
||||
assert docs[0].page_content == "content for README.md"
|
||||
assert docs[0].metadata == {
|
||||
assert docs[0].text == "content for README.md"
|
||||
assert docs[0].extra_info == {
|
||||
"title": "README.md",
|
||||
"source": "https://github.com/owner/repo/blob/main/README.md",
|
||||
}
|
||||
assert docs[1].page_content == "content for src/main.py"
|
||||
assert docs[1].metadata == {
|
||||
assert docs[1].text == "content for src/main.py"
|
||||
assert docs[1].extra_info == {
|
||||
"title": "src/main.py",
|
||||
"source": "https://github.com/owner/repo/blob/main/src/main.py",
|
||||
}
|
||||
@@ -142,12 +142,13 @@ class TestGitHubLoaderRobustness:
|
||||
GitHubLoader().fetch_file_content("owner/repo", "README.md")
|
||||
|
||||
@patch("application.parser.remote.github_loader.requests.get")
|
||||
def test_fetch_file_content_unexpected_shape_missing_content_raises(self, mock_get):
|
||||
def test_fetch_file_content_unexpected_shape_missing_content_returns_none(self, mock_get):
|
||||
# encoding indicates base64 text, but 'content' key is missing
|
||||
# With the new code, the exception is caught and returns None (treated as binary/skipped)
|
||||
resp = make_response({"encoding": "base64"})
|
||||
mock_get.return_value = resp
|
||||
with pytest.raises(KeyError):
|
||||
GitHubLoader().fetch_file_content("owner/repo", "README.md")
|
||||
result = GitHubLoader().fetch_file_content("owner/repo", "file.txt")
|
||||
assert result is None
|
||||
|
||||
@patch("application.parser.remote.github_loader.base64.b64decode")
|
||||
@patch("application.parser.remote.github_loader.requests.get")
|
||||
@@ -156,4 +157,4 @@ class TestGitHubLoaderRobustness:
|
||||
mock_b64decode.side_effect = AssertionError("b64decode should not be called for binary files")
|
||||
mock_get.return_value = make_response({"encoding": "base64", "content": "AAA"})
|
||||
result = GitHubLoader().fetch_file_content("owner/repo", "bigfile.bin")
|
||||
assert result == "Filename: bigfile.bin is a binary file and was skipped."
|
||||
assert result is None
|
||||
|
||||
Reference in New Issue
Block a user