feat: add GitHub access token support and fix file content fetching logic

2025-11-29 08:33:20 +00:00 · 2025-10-07 14:44:37 +01:00
3 changed files with 139 additions and 35 deletions
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -51,6 +51,9 @@ class Settings(BaseSettings):
        "http://127.0.0.1:7091/api/connectors/callback"  ##add redirect url as it is to your provider's console(gcp)
    )

+    # GitHub source
+    GITHUB_ACCESS_TOKEN: Optional[str] = None # Personal access token (PAT) with read access to the repository
+
    # LLM Cache
    CACHE_REDIS_URL: str = "redis://localhost:6379/2"

--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -1,44 +1,135 @@
 import base64
 import requests
-from typing import List
+import time
+from typing import List, Optional
 from application.parser.remote.base import BaseRemote
-from langchain_core.documents import Document
+from application.parser.schema.base import Document
 import mimetypes
+from application.core.settings import settings

 class GitHubLoader(BaseRemote):
    def __init__(self):
-        self.access_token = None
+        self.access_token = settings.GITHUB_ACCESS_TOKEN
        self.headers = {
-            "Authorization": f"token {self.access_token}"
-        } if self.access_token else {}
+            "Authorization": f"token {self.access_token}",
+            "Accept": "application/vnd.github.v3+json"
+        } if self.access_token else {
+            "Accept": "application/vnd.github.v3+json"
+        }
        return

-    def fetch_file_content(self, repo_url: str, file_path: str) -> str:
+    def is_text_file(self, file_path: str) -> bool:
+        """Determine if a file is a text file based on its extension or guessed MIME type."""
+        # Common text file extensions
+        text_extensions = {
+            '.txt', '.md', '.markdown', '.rst', '.json', '.xml', '.yaml', '.yml',
+            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.c', '.cpp', '.h', '.hpp',
+            '.cs', '.go', '.rs', '.rb', '.php', '.swift', '.kt', '.scala',
+            '.html', '.css', '.scss', '.sass', '.less',
+            '.sh', '.bash', '.zsh', '.fish',
+            '.sql', '.r', '.m', '.mat',
+            '.ini', '.cfg', '.conf', '.config', '.env',
+            '.gitignore', '.dockerignore', '.editorconfig',
+            '.log', '.csv', '.tsv'
+        }
+
+        # Match the lowercased path against the known text-file suffixes
+        file_lower = file_path.lower()
+        for ext in text_extensions:
+            if file_lower.endswith(ext):
+                return True
+
+        # Also check MIME type
+        mime_type, _ = mimetypes.guess_type(file_path)
+        if mime_type and (mime_type.startswith("text") or mime_type in ["application/json", "application/xml"]):
+            return True
+
+        return False
+
+    def fetch_file_content(self, repo_url: str, file_path: str) -> Optional[str]:
+        """Fetch file content. Returns None if the file should be skipped (binary, undecodable, or empty files)."""
        url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}"
-        response = requests.get(url, headers=self.headers)
+        response = self._make_request(url)

-        if response.status_code == 200:
-            content = response.json()
-            mime_type, _ = mimetypes.guess_type(file_path)  # Guess the MIME type based on the file extension
+        content = response.json()

-            if content.get("encoding") == "base64":
-                if mime_type and mime_type.startswith("text"):  # Handle only text files
-                    try:
-                        decoded_content = base64.b64decode(content["content"]).decode("utf-8")
-                        return f"Filename: {file_path}\n\n{decoded_content}"
-                    except Exception as e:
-                        raise e
-                else:
-                    return f"Filename: {file_path} is a binary file and was skipped."
+        if content.get("encoding") == "base64":
+            if self.is_text_file(file_path):  # Handle only text files
+                try:
+                    decoded_content = base64.b64decode(content["content"]).decode("utf-8").strip()
+                    # Skip empty files
+                    if not decoded_content:
+                        return None
+                    return decoded_content
+                except Exception:
+                    # If decoding fails (or the 'content' key is missing), treat it as a binary file and skip it
+                    return None
            else:
-                return f"Filename: {file_path}\n\n{content['content']}"
+                # Skip binary files by returning None
+                return None
        else:
-            response.raise_for_status()
+            file_content = content['content'].strip()
+            # Skip empty files
+            if not file_content:
+                return None
+            return file_content
+
+    def _make_request(self, url: str, max_retries: int = 3) -> requests.Response:
+        """Make a GET request, retrying with exponential backoff when a 403 response reports a rate limit."""
+        for attempt in range(max_retries):
+            response = requests.get(url, headers=self.headers)
+
+            if response.status_code == 200:
+                return response
+            elif response.status_code == 403:
+                # Check if it's a rate limit issue
+                try:
+                    error_data = response.json()
+                    error_msg = error_data.get("message", "")
+
+                    # Check rate limit headers
+                    remaining = response.headers.get("X-RateLimit-Remaining", "unknown")
+                    reset_time = response.headers.get("X-RateLimit-Reset", "unknown")
+
+                    print(f"GitHub API 403 Error: {error_msg}")
+                    print(f"Rate limit remaining: {remaining}, Reset time: {reset_time}")
+
+                    if "rate limit" in error_msg.lower():
+                        if attempt < max_retries - 1:
+                            wait_time = 2 ** attempt  # Exponential backoff
+                            print(f"Rate limit hit, waiting {wait_time} seconds before retry...")
+                            time.sleep(wait_time)
+                            continue
+
+                    # Provide helpful error message
+                    if remaining == "0":
+                        raise Exception(f"GitHub API rate limit exceeded. Please set GITHUB_ACCESS_TOKEN environment variable. Reset time: {reset_time}")
+                    else:
+                        raise Exception(f"GitHub API error: {error_msg}. This may require authentication - set GITHUB_ACCESS_TOKEN environment variable.")
+                except Exception as e:
+                    if isinstance(e, Exception) and "GitHub API" in str(e):
+                        raise
+                    # If we can't parse the response, raise the original error
+                    response.raise_for_status()
+            else:
+                response.raise_for_status()
+
+        return response

    def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]:
        url = f"https://api.github.com/repos/{repo_url}/contents/{path}"
-        response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"})
+        response = self._make_request(url)
+
        contents = response.json()
+
+        # Handle error responses from GitHub API
+        if isinstance(contents, dict) and "message" in contents:
+            raise Exception(f"GitHub API error: {contents.get('message')}")
+
+        # Ensure contents is a list
+        if not isinstance(contents, list):
+            raise TypeError(f"Expected list from GitHub API, got {type(contents).__name__}: {contents}")
+
        files = []
        for item in contents:
            if item["type"] == "file":
@@ -53,6 +144,15 @@ class GitHubLoader(BaseRemote):
        documents = []
        for file_path in files:
            content = self.fetch_file_content(repo_name, file_path)
-            documents.append(Document(page_content=content, metadata={"title": file_path, 
-            "source": f"https://github.com/{repo_name}/blob/main/{file_path}"}))
+            # Skip files with no usable content (binary, undecodable, or empty; content is None)
+            if content is None:
+                continue
+            documents.append(Document(
+                text=content,
+                doc_id=file_path,
+                extra_info={
+                    "title": file_path,
+                    "source": f"https://github.com/{repo_name}/blob/main/{file_path}"
+                }
+            ))
        return documents
--- a/tests/parser/remote/test_github_loader.py
+++ b/tests/parser/remote/test_github_loader.py
@@ -27,7 +27,7 @@ class TestGitHubLoaderFetchFileContent:

        result = loader.fetch_file_content("owner/repo", "README.md")

-        assert result == f"Filename: README.md\n\n{content_str}"
+        assert result == content_str
        mock_get.assert_called_once_with(
            "https://api.github.com/repos/owner/repo/contents/README.md",
            headers=loader.headers,
@@ -40,7 +40,7 @@ class TestGitHubLoaderFetchFileContent:

        result = loader.fetch_file_content("owner/repo", "image.png")

-        assert result == "Filename: image.png is a binary file and was skipped."
+        assert result is None

    @patch("application.parser.remote.github_loader.requests.get")
    def test_non_base64_plain_content(self, mock_get):
@@ -49,7 +49,7 @@ class TestGitHubLoaderFetchFileContent:

        result = loader.fetch_file_content("owner/repo", "file.txt")

-        assert result == "Filename: file.txt\n\nPlain text"
+        assert result == "Plain text"

    @patch("application.parser.remote.github_loader.requests.get")
    def test_http_error_raises(self, mock_get):
@@ -102,13 +102,13 @@ class TestGitHubLoaderLoadData:
        docs = loader.load_data("https://github.com/owner/repo")

        assert len(docs) == 2
-        assert docs[0].page_content == "content for README.md"
-        assert docs[0].metadata == {
+        assert docs[0].text == "content for README.md"
+        assert docs[0].extra_info == {
            "title": "README.md",
            "source": "https://github.com/owner/repo/blob/main/README.md",
        }
-        assert docs[1].page_content == "content for src/main.py"
-        assert docs[1].metadata == {
+        assert docs[1].text == "content for src/main.py"
+        assert docs[1].extra_info == {
            "title": "src/main.py",
            "source": "https://github.com/owner/repo/blob/main/src/main.py",
        }
@@ -142,12 +142,13 @@ class TestGitHubLoaderRobustness:
            GitHubLoader().fetch_file_content("owner/repo", "README.md")

    @patch("application.parser.remote.github_loader.requests.get")
-    def test_fetch_file_content_unexpected_shape_missing_content_raises(self, mock_get):
+    def test_fetch_file_content_unexpected_shape_missing_content_returns_none(self, mock_get):
        # encoding indicates base64 text, but 'content' key is missing
+        # With the new code, the resulting KeyError is caught and None is returned (file treated as binary/skipped)
        resp = make_response({"encoding": "base64"})
        mock_get.return_value = resp
-        with pytest.raises(KeyError):
-            GitHubLoader().fetch_file_content("owner/repo", "README.md")
+        result = GitHubLoader().fetch_file_content("owner/repo", "file.txt")
+        assert result is None

    @patch("application.parser.remote.github_loader.base64.b64decode")
    @patch("application.parser.remote.github_loader.requests.get")
@@ -156,4 +157,4 @@ class TestGitHubLoaderRobustness:
        mock_b64decode.side_effect = AssertionError("b64decode should not be called for binary files")
        mock_get.return_value = make_response({"encoding": "base64", "content": "AAA"})
        result = GitHubLoader().fetch_file_content("owner/repo", "bigfile.bin")
-        assert result == "Filename: bigfile.bin is a binary file and was skipped."
+        assert result is None