diff --git a/HACKTOBERFEST.md b/HACKTOBERFEST.md index 631f73ba..47679960 100644 --- a/HACKTOBERFEST.md +++ b/HACKTOBERFEST.md @@ -7,7 +7,7 @@ All contributors with accepted PRs will receive a cool Holopin! 🤩 (Watch out ### 🏆 Top 50 contributors will recieve a special T-shirt ### 🏆 [LLM Document analysis by LexEU competition](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md): -A separate competition is available for those who sumbit new retrieval / workflow method that will analyze a Document using EU laws. +A separate competition is available for those who submit new retrieval / workflow method that will analyze a Document using EU laws. With 200$, 100$, 50$ prize for 1st, 2nd and 3rd place respectively. You can find more information [here](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 340d020a..c409e69a 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -363,6 +363,7 @@ class UploadRemote(Resource): ), "name": fields.String(required=True, description="Job name"), "data": fields.String(required=True, description="Data to process"), + "repo_url": fields.String(description="GitHub repository URL"), }, ) ) @@ -377,11 +378,18 @@ class UploadRemote(Resource): return missing_fields try: + if "repo_url" in data: + source_data = data["repo_url"] + loader = "github" + else: + source_data = data["data"] + loader = data["source"] + task = ingest_remote.delay( - source_data=data["data"], + source_data=source_data, job_name=data["name"], user=data["user"], - loader=data["source"], + loader=loader, ) except Exception as err: return make_response(jsonify({"success": False, "error": str(err)}), 400) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index e69de29b..49f0ae9c 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ 
import base64
from typing import List, Optional
from urllib.parse import quote

import requests
from langchain_core.documents import Document

from application.parser.remote.base import BaseRemote


class GitHubLoader(BaseRemote):
    """Load the files of a GitHub repository as ``Document`` objects.

    The repository tree is walked through the GitHub REST "contents"
    endpoint; every file that decodes as UTF-8 becomes one ``Document``.
    """

    # Base of the REST endpoint; "<owner>/<repo>/contents/<path>" is appended.
    API_ROOT = "https://api.github.com/repos"
    # Seconds to wait on each HTTP call. Without a timeout ``requests`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, access_token: Optional[str] = None):
        """Create a loader.

        Args:
            access_token: Optional GitHub token. When given it is sent in the
                ``Authorization`` header; when omitted requests are
                unauthenticated (the previous, only, behaviour).
        """
        self.access_token = access_token
        self.headers = (
            {"Authorization": f"token {access_token}"} if access_token else {}
        )

    @staticmethod
    def _repo_name(repo_url: str) -> str:
        """Reduce a repository URL to its ``owner/repo`` part.

        Accepts a bare ``owner/repo`` as well as a full URL, and tolerates a
        trailing slash, a ``.git`` suffix, a query string or fragment, and
        extra path segments such as ``/tree/<branch>``.
        """
        name = repo_url.strip().split("github.com/")[-1]
        name = name.split("?")[0].split("#")[0].strip("/")
        # Only the first two segments identify the repository.
        name = "/".join(name.split("/")[:2])
        if name.endswith(".git"):
            name = name[: -len(".git")]
        return name

    def _contents_url(self, repo_url: str, path: str) -> str:
        """Build the contents-endpoint URL for *path* inside *repo_url*."""
        # quote() keeps "/" but escapes characters such as "#" and "?" that
        # would otherwise cut the request path short.
        return f"{self.API_ROOT}/{repo_url}/contents/{quote(path)}"

    def fetch_file_content(self, repo_url: str, file_path: str) -> str:
        """Return the text of one file, prefixed with a ``Filename:`` header.

        Args:
            repo_url: Repository in ``owner/repo`` form.
            file_path: Path of the file inside the repository.

        Returns:
            ``"Filename: <file_path>\\n\\n<content>"``.

        Raises:
            requests.HTTPError: The API answered with a 4xx/5xx status.
            Exception: The base64 payload could not be decoded as UTF-8
                (for example ``UnicodeDecodeError`` for a binary file); the
                error is printed and re-raised unchanged.
        """
        response = requests.get(
            self._contents_url(repo_url, file_path),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on error statuses instead of falling through and
        # implicitly returning None.
        response.raise_for_status()

        content = response.json()
        if content.get("encoding") == "base64":
            try:
                decoded_content = base64.b64decode(content["content"]).decode("utf-8")
            except Exception as e:
                print(f"Error decoding content for {file_path}: {e}")
                raise
            return f"Filename: {file_path}\n\n{decoded_content}"
        return f"Filename: {file_path}\n\n{content['content']}"

    def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]:
        """Recursively list the paths of all files below *path*.

        Args:
            repo_url: Repository in ``owner/repo`` form.
            path: Directory to start from; the empty string is the root.

        Returns:
            Repository-relative paths of every entry whose type is ``file``.

        Raises:
            requests.HTTPError: The API answered with a 4xx/5xx status.
        """
        response = requests.get(
            self._contents_url(repo_url, path),
            headers={**self.headers, "Accept": "application/vnd.github.v3.raw"},
            timeout=self.REQUEST_TIMEOUT,
        )
        # An error response carries a JSON object, not a listing; iterating
        # it would fail with an unhelpful TypeError, so surface the HTTP
        # error itself.
        response.raise_for_status()

        files: List[str] = []
        for item in response.json():
            if item["type"] == "file":
                files.append(item["path"])
            elif item["type"] == "dir":
                files.extend(self.fetch_repo_files(repo_url, item["path"]))
        return files

    def load_data(self, repo_url: str) -> List[Document]:
        """Fetch every UTF-8 decodable file of a repository as a ``Document``.

        Args:
            repo_url: Repository URL (or ``owner/repo``).

        Returns:
            One ``Document`` per file, with the file path as ``title`` and a
            link to the file on github.com as ``source``.

        Raises:
            requests.HTTPError: Any API call answered with a 4xx/5xx status.
        """
        repo_name = self._repo_name(repo_url)
        documents = []
        for file_path in self.fetch_repo_files(repo_name):
            try:
                content = self.fetch_file_content(repo_name, file_path)
            except UnicodeDecodeError:
                # Binary file (image, archive, ...): skip it rather than
                # abort the whole ingestion. fetch_file_content has already
                # printed the reason.
                continue
            documents.append(
                Document(
                    page_content=content,
                    metadata={
                        "title": file_path,
                        # "HEAD" resolves to the default branch, so the link
                        # also works when that branch is not named "main".
                        "source": f"https://github.com/{repo_name}/blob/HEAD/{file_path}",
                    },
                )
            )
        return documents
d2a58f8d..026abd76 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -2,6 +2,7 @@ from application.parser.remote.sitemap_loader import SitemapLoader from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader from application.parser.remote.reddit_loader import RedditPostsLoaderRemote +from application.parser.remote.github_loader import GitHubLoader class RemoteCreator: @@ -10,6 +11,7 @@ class RemoteCreator: "sitemap": SitemapLoader, "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, + "github": GitHubLoader, } @classmethod diff --git a/frontend/src/Hero.tsx b/frontend/src/Hero.tsx index 04da8769..644848dc 100644 --- a/frontend/src/Hero.tsx +++ b/frontend/src/Hero.tsx @@ -37,7 +37,7 @@ export default function Hero({