From 2611550ffd849e12568379cba230ab9bd947be67 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 2 Oct 2024 23:44:29 +0100 Subject: [PATCH 1/6] --- application/api/user/routes.py | 12 ++++- application/parser/remote/github_loader.py | 49 +++++++++++++++++++++ application/parser/remote/remote_creator.py | 2 + frontend/src/upload/Upload.tsx | 34 +++++++++++++- 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 340d020a..c409e69a 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -363,6 +363,7 @@ class UploadRemote(Resource): ), "name": fields.String(required=True, description="Job name"), "data": fields.String(required=True, description="Data to process"), + "repo_url": fields.String(description="GitHub repository URL"), }, ) ) @@ -377,11 +378,18 @@ class UploadRemote(Resource): return missing_fields try: + if "repo_url" in data: + source_data = data["repo_url"] + loader = "github" + else: + source_data = data["data"] + loader = data["source"] + task = ingest_remote.delay( - source_data=data["data"], + source_data=source_data, job_name=data["name"], user=data["user"], - loader=data["source"], + loader=loader, ) except Exception as err: return make_response(jsonify({"success": False, "error": str(err)}), 400) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index e69de29b..2839f48d 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -0,0 +1,49 @@ +import os +import base64 +import requests +from typing import List +from application.parser.remote.base import BaseRemote +from application.parser.schema.base import Document + +class GitHubLoader(BaseRemote): + def __init__(self, access_token: str): + self.access_token = access_token + + def fetch_file_content(self, repo_url: str, file_path: str) -> str: + url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}" + headers = { + "Authorization": f"token {self.access_token}", + "Accept": "application/vnd.github.v3.raw" + } + response = requests.get(url, headers=headers) + response.raise_for_status() + content = response.json() + if content.get("encoding") == "base64": + return base64.b64decode(content["content"]).decode("utf-8") + return content["content"] + + def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]: + url = f"https://api.github.com/repos/{repo_url}/contents/{path}" + headers = { + "Authorization": f"token {self.access_token}", + "Accept": "application/vnd.github.v3.raw" + } + response = requests.get(url, headers=headers) + response.raise_for_status() + contents = response.json() + files = [] + for item in contents: + if item["type"] == "file": + files.append(item["path"]) + elif item["type"] == "dir": + files.extend(self.fetch_repo_files(repo_url, item["path"])) + return files + + def load_data(self, repo_url: str) -> List[Document]: + repo_name = repo_url.split("github.com/")[-1] + files = self.fetch_repo_files(repo_name) + documents = [] + for file_path in files: + content = self.fetch_file_content(repo_name, file_path) + documents.append(Document(content=content, metadata={"file_path": file_path})) + return documents diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index d2a58f8d..026abd76 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -2,6 +2,7 @@ from application.parser.remote.sitemap_loader import SitemapLoader from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader from application.parser.remote.reddit_loader import RedditPostsLoaderRemote +from application.parser.remote.github_loader import GitHubLoader class RemoteCreator: @@ -10,6 +11,7 @@ class RemoteCreator: "sitemap": SitemapLoader, "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, + "github": GitHubLoader, } @classmethod diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index b898e4b6..50a6d357 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -24,6 +24,7 @@ function Upload({ const [docName, setDocName] = useState(''); const [urlName, setUrlName] = useState(''); const [url, setUrl] = useState(''); + const [repoUrl, setRepoUrl] = useState(''); // P3f93 const [redditData, setRedditData] = useState({ client_id: '', client_secret: '', @@ -48,6 +49,7 @@ function Upload({ // { label: 'Sitemap', value: 'sitemap' }, { label: 'Link', value: 'url' }, { label: 'Reddit', value: 'reddit' }, + { label: 'GitHub', value: 'github' }, // P3f93 ]; const [urlType, setUrlType] = useState<{ label: string; value: string }>({ @@ -238,6 +240,9 @@ function Upload({ formData.set('name', 'other'); formData.set('data', JSON.stringify(redditData)); } + if (urlType.value === 'github') { + formData.append('repo_url', repoUrl); // Pdeac + } const apiHost = import.meta.env.VITE_API_HOST; const xhr = new XMLHttpRequest(); xhr.upload.addEventListener('progress', (event) => { @@ -376,7 +381,7 @@ function Upload({ size="w-full" rounded="3xl" /> - {urlType.label !== 'Reddit' ? ( + {urlType.label !== 'Reddit' && urlType.label !== 'GitHub' ? ( <> + ) : urlType.label === 'GitHub' ? ( // P3f93 + <> + setUrlName(e.target.value)} + borderVariant="thin" + > +
+ + {t('modals.uploadDoc.name')} + +
+ setRepoUrl(e.target.value)} + borderVariant="thin" + > +
+ + {t('modals.uploadDoc.repoUrl')} + +
+ ) : (
From 8fa88175c1fe9ecba31c74aa9f17bf59f1aaddc7 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 5 Oct 2024 21:33:58 +0100 Subject: [PATCH 2/6] fix: translation + auth --- application/parser/remote/github_loader.py | 43 ++++++++++++---------- frontend/src/locale/en.json | 1 + frontend/src/locale/es.json | 1 + frontend/src/locale/jp.json | 1 + frontend/src/locale/zh.json | 1 + 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 2839f48d..f72d5278 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -1,34 +1,37 @@ -import os import base64 import requests from typing import List from application.parser.remote.base import BaseRemote -from application.parser.schema.base import Document +from langchain_core.documents import Document class GitHubLoader(BaseRemote): - def __init__(self, access_token: str): - self.access_token = access_token + def __init__(self): + self.access_token = None + self.headers = { + "Authorization": f"token {self.access_token}" + } if self.access_token else {} + return def fetch_file_content(self, repo_url: str, file_path: str) -> str: url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}" - headers = { - "Authorization": f"token {self.access_token}", - "Accept": "application/vnd.github.v3.raw" - } - response = requests.get(url, headers=headers) - response.raise_for_status() - content = response.json() - if content.get("encoding") == "base64": - return base64.b64decode(content["content"]).decode("utf-8") - return content["content"] + response = requests.get(url, headers=self.headers) + + if response.status_code == 200: + content = response.json() + if content.get("encoding") == "base64": + try: + decoded_content = base64.b64decode(content["content"]).decode("utf-8") + return decoded_content + except Exception as e: + raise + else: + return content["content"] + else: + response.raise_for_status() def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]: url = f"https://api.github.com/repos/{repo_url}/contents/{path}" - headers = { - "Authorization": f"token {self.access_token}", - "Accept": "application/vnd.github.v3.raw" - } - response = requests.get(url, headers=headers) + response = requests.get(url, headers=self.headers) response.raise_for_status() contents = response.json() files = [] @@ -45,5 +48,5 @@ class GitHubLoader(BaseRemote): documents = [] for file_path in files: content = self.fetch_file_content(repo_name, file_path) - documents.append(Document(content=content, metadata={"file_path": file_path})) + documents.append(Document(page_content=content, metadata={"file_path": file_path})) return documents diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index fa2cac3c..c9b599bf 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -85,6 +85,7 @@ "train": "Train", "link": "Link", "urlLink": "URL Link", + "repoUrl": "Repository URL", "reddit": { "id": "Client ID", "secret": "Client Secret", diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json index 7b7dbec0..98b38d7c 100644 --- a/frontend/src/locale/es.json +++ b/frontend/src/locale/es.json @@ -85,6 +85,7 @@ "train": "Entrenar", "link": "Enlace", "urlLink": "Enlace URL", + "repoUrl": "URL del Repositorio", "reddit": { "id": "ID de Cliente", "secret": "Secreto de Cliente", diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json index fa61c291..b34cc5e5 100644 --- a/frontend/src/locale/jp.json +++ b/frontend/src/locale/jp.json @@ -85,6 +85,7 @@ "train": "トレーニング", "link": "リンク", "urlLink": "URLリンク", + "repoUrl": "リポジトリURL", "reddit": { "id": "クライアントID", "secret": "クライアントシークレット", diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json index 080c4ee3..7decdefe 100644 --- a/frontend/src/locale/zh.json +++ b/frontend/src/locale/zh.json @@ -85,6 +85,7 @@ "train": "训练", "link": "链接", "urlLink": "URL 链接", + "repoUrl": "存储库 URL", "reddit": { "id": "客户端 ID", "secret": "客户端密钥", From 1ad82c22d977cdd40dd27ca2afe8b4a5e7b37d26 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 5 Oct 2024 21:36:04 +0100 Subject: [PATCH 3/6] fix: headers --- application/parser/remote/github_loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index f72d5278..8ffa5af1 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -31,8 +31,7 @@ class GitHubLoader(BaseRemote): def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]: url = f"https://api.github.com/repos/{repo_url}/contents/{path}" - response = requests.get(url, headers=self.headers) - response.raise_for_status() + response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"}) contents = response.json() files = [] for item in contents: From 7717242112b7ed4752eb73d3ded5761a7d170a10 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 5 Oct 2024 21:37:55 +0100 Subject: [PATCH 4/6] fix(lint): ruff var --- application/parser/remote/github_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 8ffa5af1..2137e62a 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -23,6 +23,7 @@ class GitHubLoader(BaseRemote): decoded_content = base64.b64decode(content["content"]).decode("utf-8") return decoded_content except Exception as e: + print(f"Error decoding content for {file_path}: {e}") raise else: return content["content"] From c04687fdd1cc797121bd030d0f6eafde5b18d5d2 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 5 Oct 2024 21:53:30 +0100 Subject: [PATCH 5/6] fix: github loader metadata clickable --- application/parser/remote/github_loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 2137e62a..d35da20d 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -48,5 +48,6 @@ class GitHubLoader(BaseRemote): documents = [] for file_path in files: content = self.fetch_file_content(repo_name, file_path) - documents.append(Document(page_content=content, metadata={"file_path": file_path})) + documents.append(Document(page_content=content, metadata={"title": file_path, + "source": f"https://github.com/{repo_name}/blob/main/{file_path}"})) return documents From 6932c7e3e9e9231b8a1bbe8bceeffdc4ece3388d Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 5 Oct 2024 21:56:47 +0100 Subject: [PATCH 6/6] feat: add filename to the top --- application/parser/remote/github_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index d35da20d..49f0ae9c 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -21,12 +21,12 @@ class GitHubLoader(BaseRemote): if content.get("encoding") == "base64": try: decoded_content = base64.b64decode(content["content"]).decode("utf-8") - return decoded_content + return f"Filename: {file_path}\n\n{decoded_content}" except Exception as e: print(f"Error decoding content for {file_path}: {e}") raise else: - return content["content"] + return f"Filename: {file_path}\n\n{content['content']}" else: response.raise_for_status()