diff --git a/application/core/settings.py b/application/core/settings.py index 4475c443..2dc159ba 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -51,6 +51,9 @@ class Settings(BaseSettings): "http://127.0.0.1:7091/api/connectors/callback" ##add redirect url as it is to your provider's console(gcp) ) + # GitHub source + GITHUB_ACCESS_TOKEN: Optional[str] = None # PAT token with read repo access + # LLM Cache CACHE_REDIS_URL: str = "redis://localhost:6379/2" diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 8f805056..327e59a2 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -1,44 +1,135 @@ import base64 import requests -from typing import List +import time +from typing import List, Optional from application.parser.remote.base import BaseRemote -from langchain_core.documents import Document +from application.parser.schema.base import Document import mimetypes +from application.core.settings import settings class GitHubLoader(BaseRemote): def __init__(self): - self.access_token = None + self.access_token = settings.GITHUB_ACCESS_TOKEN self.headers = { - "Authorization": f"token {self.access_token}" - } if self.access_token else {} + "Authorization": f"token {self.access_token}", + "Accept": "application/vnd.github.v3+json" + } if self.access_token else { + "Accept": "application/vnd.github.v3+json" + } return - def fetch_file_content(self, repo_url: str, file_path: str) -> str: + def is_text_file(self, file_path: str) -> bool: + """Determine if a file is a text file based on extension.""" + # Common text file extensions + text_extensions = { + '.txt', '.md', '.markdown', '.rst', '.json', '.xml', '.yaml', '.yml', + '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.c', '.cpp', '.h', '.hpp', + '.cs', '.go', '.rs', '.rb', '.php', '.swift', '.kt', '.scala', + '.html', '.css', '.scss', '.sass', '.less', + '.sh', '.bash', '.zsh', '.fish', + '.sql', '.r', '.m', '.mat', + '.ini', '.cfg', '.conf', '.config', '.env', + '.gitignore', '.dockerignore', '.editorconfig', + '.log', '.csv', '.tsv' + } + + # Get file extension + file_lower = file_path.lower() + for ext in text_extensions: + if file_lower.endswith(ext): + return True + + # Also check MIME type + mime_type, _ = mimetypes.guess_type(file_path) + if mime_type and (mime_type.startswith("text") or mime_type in ["application/json", "application/xml"]): + return True + + return False + + def fetch_file_content(self, repo_url: str, file_path: str) -> Optional[str]: + """Fetch file content. Returns None if file should be skipped (binary files or empty files).""" url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}" - response = requests.get(url, headers=self.headers) + response = self._make_request(url) - if response.status_code == 200: - content = response.json() - mime_type, _ = mimetypes.guess_type(file_path) # Guess the MIME type based on the file extension + content = response.json() - if content.get("encoding") == "base64": - if mime_type and mime_type.startswith("text"): # Handle only text files - try: - decoded_content = base64.b64decode(content["content"]).decode("utf-8") - return f"Filename: {file_path}\n\n{decoded_content}" - except Exception as e: - raise e - else: - return f"Filename: {file_path} is a binary file and was skipped." + if content.get("encoding") == "base64": + if self.is_text_file(file_path): # Handle only text files + try: + decoded_content = base64.b64decode(content["content"]).decode("utf-8").strip() + # Skip empty files + if not decoded_content: + return None + return decoded_content + except Exception: + # If decoding fails, it's probably a binary file + return None else: - return f"Filename: {file_path}\n\n{content['content']}" + # Skip binary files by returning None + return None else: - response.raise_for_status() + file_content = content['content'].strip() + # Skip empty files + if not file_content: + return None + return file_content + + def _make_request(self, url: str, max_retries: int = 3) -> requests.Response: + """Make a request with retry logic for rate limiting""" + for attempt in range(max_retries): + response = requests.get(url, headers=self.headers) + + if response.status_code == 200: + return response + elif response.status_code == 403: + # Check if it's a rate limit issue + try: + error_data = response.json() + error_msg = error_data.get("message", "") + + # Check rate limit headers + remaining = response.headers.get("X-RateLimit-Remaining", "unknown") + reset_time = response.headers.get("X-RateLimit-Reset", "unknown") + + print(f"GitHub API 403 Error: {error_msg}") + print(f"Rate limit remaining: {remaining}, Reset time: {reset_time}") + + if "rate limit" in error_msg.lower(): + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff + print(f"Rate limit hit, waiting {wait_time} seconds before retry...") + time.sleep(wait_time) + continue + + # Provide helpful error message + if remaining == "0": + raise Exception(f"GitHub API rate limit exceeded. Please set GITHUB_ACCESS_TOKEN environment variable. Reset time: {reset_time}") + else: + raise Exception(f"GitHub API error: {error_msg}. This may require authentication - set GITHUB_ACCESS_TOKEN environment variable.") + except Exception as e: + if isinstance(e, Exception) and "GitHub API" in str(e): + raise + # If we can't parse the response, raise the original error + response.raise_for_status() + else: + response.raise_for_status() + + return response def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]: url = f"https://api.github.com/repos/{repo_url}/contents/{path}" - response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"}) + response = self._make_request(url) + contents = response.json() + + # Handle error responses from GitHub API + if isinstance(contents, dict) and "message" in contents: + raise Exception(f"GitHub API error: {contents.get('message')}") + + # Ensure contents is a list + if not isinstance(contents, list): + raise TypeError(f"Expected list from GitHub API, got {type(contents).__name__}: {contents}") + files = [] for item in contents: if item["type"] == "file": @@ -53,6 +144,15 @@ class GitHubLoader(BaseRemote): documents = [] for file_path in files: content = self.fetch_file_content(repo_name, file_path) - documents.append(Document(page_content=content, metadata={"title": file_path, - "source": f"https://github.com/{repo_name}/blob/main/{file_path}"})) + # Skip binary files (content is None) + if content is None: + continue + documents.append(Document( + text=content, + doc_id=file_path, + extra_info={ + "title": file_path, + "source": f"https://github.com/{repo_name}/blob/main/{file_path}" + } + )) return documents diff --git a/tests/parser/remote/test_github_loader.py b/tests/parser/remote/test_github_loader.py index 6bb3ed2e..f52003c1 100644 --- a/tests/parser/remote/test_github_loader.py +++ b/tests/parser/remote/test_github_loader.py @@ -27,7 +27,7 @@ class TestGitHubLoaderFetchFileContent: result = loader.fetch_file_content("owner/repo", "README.md") - assert result == f"Filename: README.md\n\n{content_str}" + assert result == content_str mock_get.assert_called_once_with( "https://api.github.com/repos/owner/repo/contents/README.md", headers=loader.headers, @@ -40,7 +40,7 @@ class TestGitHubLoaderFetchFileContent: result = loader.fetch_file_content("owner/repo", "image.png") - assert result == "Filename: image.png is a binary file and was skipped." + assert result is None @patch("application.parser.remote.github_loader.requests.get") def test_non_base64_plain_content(self, mock_get): @@ -49,7 +49,7 @@ class TestGitHubLoaderFetchFileContent: result = loader.fetch_file_content("owner/repo", "file.txt") - assert result == "Filename: file.txt\n\nPlain text" + assert result == "Plain text" @patch("application.parser.remote.github_loader.requests.get") def test_http_error_raises(self, mock_get): @@ -102,13 +102,13 @@ class TestGitHubLoaderLoadData: docs = loader.load_data("https://github.com/owner/repo") assert len(docs) == 2 - assert docs[0].page_content == "content for README.md" - assert docs[0].metadata == { + assert docs[0].text == "content for README.md" + assert docs[0].extra_info == { "title": "README.md", "source": "https://github.com/owner/repo/blob/main/README.md", } - assert docs[1].page_content == "content for src/main.py" - assert docs[1].metadata == { + assert docs[1].text == "content for src/main.py" + assert docs[1].extra_info == { "title": "src/main.py", "source": "https://github.com/owner/repo/blob/main/src/main.py", } @@ -142,12 +142,13 @@ class TestGitHubLoaderRobustness: GitHubLoader().fetch_file_content("owner/repo", "README.md") @patch("application.parser.remote.github_loader.requests.get") - def test_fetch_file_content_unexpected_shape_missing_content_raises(self, mock_get): + def test_fetch_file_content_unexpected_shape_missing_content_returns_none(self, mock_get): # encoding indicates base64 text, but 'content' key is missing + # With the new code, the exception is caught and returns None (treated as binary/skipped) resp = make_response({"encoding": "base64"}) mock_get.return_value = resp - with pytest.raises(KeyError): - GitHubLoader().fetch_file_content("owner/repo", "README.md") + result = GitHubLoader().fetch_file_content("owner/repo", "file.txt") + assert result is None @patch("application.parser.remote.github_loader.base64.b64decode") @patch("application.parser.remote.github_loader.requests.get") @@ -156,4 +157,4 @@ class TestGitHubLoaderRobustness: mock_b64decode.side_effect = AssertionError("b64decode should not be called for binary files") mock_get.return_value = make_response({"encoding": "base64", "content": "AAA"}) result = GitHubLoader().fetch_file_content("owner/repo", "bigfile.bin") - assert result == "Filename: bigfile.bin is a binary file and was skipped." + assert result is None