mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-12-01 09:33:14 +00:00
Merge pull request #1205 from arc53/dartpain/add-github-loader
Github remote loader
This commit is contained in:
@@ -363,6 +363,7 @@ class UploadRemote(Resource):
|
||||
),
|
||||
"name": fields.String(required=True, description="Job name"),
|
||||
"data": fields.String(required=True, description="Data to process"),
|
||||
"repo_url": fields.String(description="GitHub repository URL"),
|
||||
},
|
||||
)
|
||||
)
|
||||
@@ -377,11 +378,18 @@ class UploadRemote(Resource):
|
||||
return missing_fields
|
||||
|
||||
try:
|
||||
if "repo_url" in data:
|
||||
source_data = data["repo_url"]
|
||||
loader = "github"
|
||||
else:
|
||||
source_data = data["data"]
|
||||
loader = data["source"]
|
||||
|
||||
task = ingest_remote.delay(
|
||||
source_data=data["data"],
|
||||
source_data=source_data,
|
||||
job_name=data["name"],
|
||||
user=data["user"],
|
||||
loader=data["source"],
|
||||
loader=loader,
|
||||
)
|
||||
except Exception as err:
|
||||
return make_response(jsonify({"success": False, "error": str(err)}), 400)
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
import base64
|
||||
import requests
|
||||
from typing import List
|
||||
from application.parser.remote.base import BaseRemote
|
||||
from langchain_core.documents import Document
|
||||
|
||||
class GitHubLoader(BaseRemote):
|
||||
def __init__(self):
|
||||
self.access_token = None
|
||||
self.headers = {
|
||||
"Authorization": f"token {self.access_token}"
|
||||
} if self.access_token else {}
|
||||
return
|
||||
|
||||
def fetch_file_content(self, repo_url: str, file_path: str) -> str:
|
||||
url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}"
|
||||
response = requests.get(url, headers=self.headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
content = response.json()
|
||||
if content.get("encoding") == "base64":
|
||||
try:
|
||||
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
|
||||
return f"Filename: {file_path}\n\n{decoded_content}"
|
||||
except Exception as e:
|
||||
print(f"Error decoding content for {file_path}: {e}")
|
||||
raise
|
||||
else:
|
||||
return f"Filename: {file_path}\n\n{content['content']}"
|
||||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]:
|
||||
url = f"https://api.github.com/repos/{repo_url}/contents/{path}"
|
||||
response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"})
|
||||
contents = response.json()
|
||||
files = []
|
||||
for item in contents:
|
||||
if item["type"] == "file":
|
||||
files.append(item["path"])
|
||||
elif item["type"] == "dir":
|
||||
files.extend(self.fetch_repo_files(repo_url, item["path"]))
|
||||
return files
|
||||
|
||||
def load_data(self, repo_url: str) -> List[Document]:
|
||||
repo_name = repo_url.split("github.com/")[-1]
|
||||
files = self.fetch_repo_files(repo_name)
|
||||
documents = []
|
||||
for file_path in files:
|
||||
content = self.fetch_file_content(repo_name, file_path)
|
||||
documents.append(Document(page_content=content, metadata={"title": file_path,
|
||||
"source": f"https://github.com/{repo_name}/blob/main/{file_path}"}))
|
||||
return documents
|
||||
|
||||
@@ -2,6 +2,7 @@ from application.parser.remote.sitemap_loader import SitemapLoader
|
||||
from application.parser.remote.crawler_loader import CrawlerLoader
|
||||
from application.parser.remote.web_loader import WebLoader
|
||||
from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
|
||||
from application.parser.remote.github_loader import GitHubLoader
|
||||
|
||||
|
||||
class RemoteCreator:
|
||||
@@ -10,6 +11,7 @@ class RemoteCreator:
|
||||
"sitemap": SitemapLoader,
|
||||
"crawler": CrawlerLoader,
|
||||
"reddit": RedditPostsLoaderRemote,
|
||||
"github": GitHubLoader,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
"train": "Train",
|
||||
"link": "Link",
|
||||
"urlLink": "URL Link",
|
||||
"repoUrl": "Repository URL",
|
||||
"reddit": {
|
||||
"id": "Client ID",
|
||||
"secret": "Client Secret",
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
"train": "Entrenar",
|
||||
"link": "Enlace",
|
||||
"urlLink": "Enlace URL",
|
||||
"repoUrl": "URL del Repositorio",
|
||||
"reddit": {
|
||||
"id": "ID de Cliente",
|
||||
"secret": "Secreto de Cliente",
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
"train": "トレーニング",
|
||||
"link": "リンク",
|
||||
"urlLink": "URLリンク",
|
||||
"repoUrl": "リポジトリURL",
|
||||
"reddit": {
|
||||
"id": "クライアントID",
|
||||
"secret": "クライアントシークレット",
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
"train": "训练",
|
||||
"link": "链接",
|
||||
"urlLink": "URL 链接",
|
||||
"repoUrl": "存储库 URL",
|
||||
"reddit": {
|
||||
"id": "客户端 ID",
|
||||
"secret": "客户端密钥",
|
||||
|
||||
@@ -24,6 +24,7 @@ function Upload({
|
||||
const [docName, setDocName] = useState('');
|
||||
const [urlName, setUrlName] = useState('');
|
||||
const [url, setUrl] = useState('');
|
||||
const [repoUrl, setRepoUrl] = useState(''); // P3f93
|
||||
const [redditData, setRedditData] = useState({
|
||||
client_id: '',
|
||||
client_secret: '',
|
||||
@@ -48,6 +49,7 @@ function Upload({
|
||||
// { label: 'Sitemap', value: 'sitemap' },
|
||||
{ label: 'Link', value: 'url' },
|
||||
{ label: 'Reddit', value: 'reddit' },
|
||||
{ label: 'GitHub', value: 'github' }, // P3f93
|
||||
];
|
||||
|
||||
const [urlType, setUrlType] = useState<{ label: string; value: string }>({
|
||||
@@ -238,6 +240,9 @@ function Upload({
|
||||
formData.set('name', 'other');
|
||||
formData.set('data', JSON.stringify(redditData));
|
||||
}
|
||||
if (urlType.value === 'github') {
|
||||
formData.append('repo_url', repoUrl); // Pdeac
|
||||
}
|
||||
const apiHost = import.meta.env.VITE_API_HOST;
|
||||
const xhr = new XMLHttpRequest();
|
||||
xhr.upload.addEventListener('progress', (event) => {
|
||||
@@ -376,7 +381,7 @@ function Upload({
|
||||
size="w-full"
|
||||
rounded="3xl"
|
||||
/>
|
||||
{urlType.label !== 'Reddit' ? (
|
||||
{urlType.label !== 'Reddit' && urlType.label !== 'GitHub' ? (
|
||||
<>
|
||||
<Input
|
||||
placeholder={`Enter ${t('modals.uploadDoc.name')}`}
|
||||
@@ -403,6 +408,33 @@ function Upload({
|
||||
</span>
|
||||
</div>
|
||||
</>
|
||||
) : urlType.label === 'GitHub' ? ( // P3f93
|
||||
<>
|
||||
<Input
|
||||
placeholder={`Enter ${t('modals.uploadDoc.name')}`}
|
||||
type="text"
|
||||
value={urlName}
|
||||
onChange={(e) => setUrlName(e.target.value)}
|
||||
borderVariant="thin"
|
||||
></Input>
|
||||
<div className="relative bottom-12 left-2 mt-[-20px]">
|
||||
<span className="bg-white px-2 text-xs text-gray-4000 dark:bg-outer-space dark:text-silver">
|
||||
{t('modals.uploadDoc.name')}
|
||||
</span>
|
||||
</div>
|
||||
<Input
|
||||
placeholder={t('modals.uploadDoc.repoUrl')}
|
||||
type="text"
|
||||
value={repoUrl}
|
||||
onChange={(e) => setRepoUrl(e.target.value)}
|
||||
borderVariant="thin"
|
||||
></Input>
|
||||
<div className="relative bottom-12 left-2 mt-[-20px]">
|
||||
<span className="bg-white px-2 text-xs text-gray-4000 dark:bg-outer-space dark:text-silver">
|
||||
{t('modals.uploadDoc.repoUrl')}
|
||||
</span>
|
||||
</div>
|
||||
</>
|
||||
) : (
|
||||
<div className="flex flex-col gap-1 mt-2">
|
||||
<div>
|
||||
|
||||
Reference in New Issue
Block a user