From 2989be47ccdd1f46e5f0324caf6eb015c0ee4743 Mon Sep 17 00:00:00 2001 From: YASH <139299779+Yash-2707@users.noreply.github.com> Date: Wed, 2 Oct 2024 22:38:43 +0530 Subject: [PATCH 01/13] Update migrate_to_v1_vectorstore.py Enhancement made in the code by error handling and logging batch processing and new functionalities like backup , progress tracking --- scripts/migrate_to_v1_vectorstore.py | 72 ++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/scripts/migrate_to_v1_vectorstore.py b/scripts/migrate_to_v1_vectorstore.py index 9a709795..da83c4c1 100644 --- a/scripts/migrate_to_v1_vectorstore.py +++ b/scripts/migrate_to_v1_vectorstore.py @@ -1,13 +1,35 @@ import pymongo import os +import shutil +import logging +from tqdm import tqdm + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger() + +# Configuration +MONGO_URI = "mongodb://localhost:27017/" +MONGO_ATLAS_URI = "mongodb+srv://:@/?retryWrites=true&w=majority" +DB_NAME = "docsgpt" + +def backup_collection(collection, backup_collection_name): + logger.info(f"Backing up collection {collection.name} to {backup_collection_name}") + collection.aggregate([{"$out": backup_collection_name}]) + logger.info("Backup completed") def migrate_to_v1_vectorstore_mongo(): - client = pymongo.MongoClient("mongodb://localhost:27017/") - db = client["docsgpt"] + client = pymongo.MongoClient(MONGO_URI) + db = client[DB_NAME] vectors_collection = db["vectors"] sources_collection = db["sources"] - for vector in vectors_collection.find(): + # Backup collections before migration + backup_collection(vectors_collection, "vectors_backup") + backup_collection(sources_collection, "sources_backup") + + vectors = list(vectors_collection.find()) + for vector in tqdm(vectors, desc="Updating vectors"): if "location" in vector: del vector["location"] if "retriever" not in vector: @@ -15,41 +37,53 @@ def migrate_to_v1_vectorstore_mongo(): vector["remote_data"] = None vectors_collection.update_one({"_id": vector["_id"]}, {"$set": vector}) - # move data from vectors_collection to sources_collection - for vector in vectors_collection.find(): + # Move data from vectors_collection to sources_collection + for vector in tqdm(vectors, desc="Moving to sources"): sources_collection.insert_one(vector) vectors_collection.drop() - client.close() + logger.info("Migration completed") def migrate_faiss_to_v1_vectorstore(): - client = pymongo.MongoClient("mongodb://localhost:27017/") - db = client["docsgpt"] + client = pymongo.MongoClient(MONGO_URI) + db = client[DB_NAME] vectors_collection = db["vectors"] - for vector in vectors_collection.find(): + vectors = list(vectors_collection.find()) + for vector in tqdm(vectors, desc="Migrating FAISS vectors"): old_path = f"./application/indexes/{vector['user']}/{vector['name']}" new_path = f"./application/indexes/{vector['_id']}" try: - os.rename(old_path, new_path) + os.makedirs(os.path.dirname(new_path), exist_ok=True) + shutil.move(old_path, new_path) except OSError as e: - print(f"Error moving {old_path} to {new_path}: {e}") + logger.error(f"Error moving {old_path} to {new_path}: {e}") client.close() + logger.info("FAISS migration completed") def migrate_mongo_atlas_vector_to_v1_vectorstore(): - client = pymongo.MongoClient("mongodb+srv://:@/?retryWrites=true&w=majority") - db = client["docsgpt"] + client = pymongo.MongoClient(MONGO_ATLAS_URI) + db = client[DB_NAME] vectors_collection = db["vectors"] - - # mongodb atlas collection documents_collection = db["documents"] - for vector in vectors_collection.find(): - documents_collection.update_many({"store": vector["user"] + "/" + vector["name"]}, {"$set": {"source_id": str(vector["_id"])}}) + # Backup collections before migration + backup_collection(vectors_collection, "vectors_backup") + backup_collection(documents_collection, "documents_backup") + + vectors = list(vectors_collection.find()) + for vector in tqdm(vectors, desc="Updating Mongo Atlas vectors"): + documents_collection.update_many( + {"store": vector["user"] + "/" + vector["name"]}, + {"$set": {"source_id": str(vector["_id"])}} + ) client.close() + logger.info("Mongo Atlas migration completed") -migrate_faiss_to_v1_vectorstore() -migrate_to_v1_vectorstore_mongo() \ No newline at end of file +if __name__ == "__main__": + migrate_faiss_to_v1_vectorstore() + migrate_to_v1_vectorstore_mongo() + migrate_mongo_atlas_vector_to_v1_vectorstore() From 2611550ffd849e12568379cba230ab9bd947be67 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 2 Oct 2024 23:44:29 +0100 Subject: [PATCH 02/13] --- application/api/user/routes.py | 12 ++++- application/parser/remote/github_loader.py | 49 +++++++++++++++++++++ application/parser/remote/remote_creator.py | 2 + frontend/src/upload/Upload.tsx | 34 +++++++++++++- 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 340d020a..c409e69a 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -363,6 +363,7 @@ class UploadRemote(Resource): ), "name": fields.String(required=True, description="Job name"), "data": fields.String(required=True, description="Data to process"), + "repo_url": fields.String(description="GitHub repository URL"), }, ) ) @@ -377,11 +378,18 @@ class UploadRemote(Resource): return missing_fields try: + if "repo_url" in data: + source_data = data["repo_url"] + loader = "github" + else: + source_data = data["data"] + loader = data["source"] + task = ingest_remote.delay( - source_data=data["data"], + source_data=source_data, job_name=data["name"], user=data["user"], - loader=data["source"], + loader=loader, ) except Exception as err: return make_response(jsonify({"success": False, "error": str(err)}), 400) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index e69de29b..2839f48d 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -0,0 +1,49 @@ +import os +import base64 +import requests +from typing import List +from application.parser.remote.base import BaseRemote +from application.parser.schema.base import Document + +class GitHubLoader(BaseRemote): + def __init__(self, access_token: str): + self.access_token = access_token + + def fetch_file_content(self, repo_url: str, file_path: str) -> str: + url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}" + headers = { + "Authorization": f"token {self.access_token}", + "Accept": "application/vnd.github.v3.raw" + } + response = requests.get(url, headers=headers) + response.raise_for_status() + content = response.json() + if content.get("encoding") == "base64": + return base64.b64decode(content["content"]).decode("utf-8") + return content["content"] + + def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]: + url = f"https://api.github.com/repos/{repo_url}/contents/{path}" + headers = { + "Authorization": f"token {self.access_token}", + "Accept": "application/vnd.github.v3.raw" + } + response = requests.get(url, headers=headers) + response.raise_for_status() + contents = response.json() + files = [] + for item in contents: + if item["type"] == "file": + files.append(item["path"]) + elif item["type"] == "dir": + files.extend(self.fetch_repo_files(repo_url, item["path"])) + return files + + def load_data(self, repo_url: str) -> List[Document]: + repo_name = repo_url.split("github.com/")[-1] + files = self.fetch_repo_files(repo_name) + documents = [] + for file_path in files: + content = self.fetch_file_content(repo_name, file_path) + documents.append(Document(content=content, metadata={"file_path": file_path})) + return documents diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index d2a58f8d..026abd76 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -2,6 +2,7 @@ from application.parser.remote.sitemap_loader import SitemapLoader from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader from application.parser.remote.reddit_loader import RedditPostsLoaderRemote +from application.parser.remote.github_loader import GitHubLoader class RemoteCreator: @@ -10,6 +11,7 @@ class RemoteCreator: "sitemap": SitemapLoader, "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, + "github": GitHubLoader, } @classmethod diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index b898e4b6..50a6d357 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -24,6 +24,7 @@ function Upload({ const [docName, setDocName] = useState(''); const [urlName, setUrlName] = useState(''); const [url, setUrl] = useState(''); + const [repoUrl, setRepoUrl] = useState(''); // P3f93 const [redditData, setRedditData] = useState({ client_id: '', client_secret: '', @@ -48,6 +49,7 @@ function Upload({ // { label: 'Sitemap', value: 'sitemap' }, { label: 'Link', value: 'url' }, { label: 'Reddit', value: 'reddit' }, + { label: 'GitHub', value: 'github' }, // P3f93 ]; const [urlType, setUrlType] = useState<{ label: string; value: string }>({ @@ -238,6 +240,9 @@ function Upload({ formData.set('name', 'other'); formData.set('data', JSON.stringify(redditData)); } + if (urlType.value === 'github') { + formData.append('repo_url', repoUrl); // Pdeac + } const apiHost = import.meta.env.VITE_API_HOST; const xhr = new XMLHttpRequest(); xhr.upload.addEventListener('progress', (event) => { @@ -376,7 +381,7 @@ function Upload({ size="w-full" rounded="3xl" /> - {urlType.label !== 'Reddit' ? ( + {urlType.label !== 'Reddit' && urlType.label !== 'GitHub' ? ( <> + ) : urlType.label === 'GitHub' ? ( // P3f93 + <> + setUrlName(e.target.value)} + borderVariant="thin" + > +
+ + {t('modals.uploadDoc.name')} + +
+ setRepoUrl(e.target.value)} + borderVariant="thin" + > +
+ + {t('modals.uploadDoc.repoUrl')} + +
+ ) : (
From 1616124fa20efe656cf5bfa25983cd01ff9a3100 Mon Sep 17 00:00:00 2001 From: negativenagesh Date: Thu, 3 Oct 2024 16:22:54 +0530 Subject: [PATCH 03/13] Documentation error in Hacktoberfest.md --- HACKTOBERFEST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HACKTOBERFEST.md b/HACKTOBERFEST.md index 631f73ba..47679960 100644 --- a/HACKTOBERFEST.md +++ b/HACKTOBERFEST.md @@ -7,7 +7,7 @@ All contributors with accepted PRs will receive a cool Holopin! 🤩 (Watch out ### 🏆 Top 50 contributors will recieve a special T-shirt ### 🏆 [LLM Document analysis by LexEU competition](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md): -A separate competition is available for those who sumbit new retrieval / workflow method that will analyze a Document using EU laws. +A separate competition is available for those who submit new retrieval / workflow method that will analyze a Document using EU laws. With 200$, 100$, 50$ prize for 1st, 2nd and 3rd place respectively. You can find more information [here](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md) From 03adfd4898d78ba9d68db17fd23bae90dd2cafc4 Mon Sep 17 00:00:00 2001 From: JeevaRamanathan Date: Thu, 3 Oct 2024 23:34:56 +0530 Subject: [PATCH 04/13] fix:navigation and deletion issues in conversations Signed-off-by: JeevaRamanathan --- frontend/src/Navigation.tsx | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/frontend/src/Navigation.tsx b/frontend/src/Navigation.tsx index 87cbbe51..7d3333ee 100644 --- a/frontend/src/Navigation.tsx +++ b/frontend/src/Navigation.tsx @@ -119,6 +119,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { .delete(id, {}) .then(() => { fetchConversations(); + resetConversation(); }) .catch((error) => console.error(error)); }; @@ -155,6 +156,15 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { }); }; + const resetConversation = () => { + dispatch(setConversation([])); + dispatch( + updateConversationId({ + query: { conversationId: null }, + }), + ); + }; + async function updateConversationName(updatedConversation: { name: string; id: string; @@ -235,14 +245,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
{ - dispatch(setConversation([])); - dispatch( - updateConversationId({ - query: { conversationId: null }, - }), - ); - }} + onClick={resetConversation} className={({ isActive }) => `${ isActive ? 'bg-gray-3000 dark:bg-transparent' : '' @@ -310,6 +313,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { isActive ? 'bg-gray-3000 dark:bg-transparent' : '' }` } + onClick={resetConversation} > Date: Fri, 4 Oct 2024 23:06:25 +0300 Subject: [PATCH 05/13] fix: folloup cards borders --- frontend/src/Hero.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/Hero.tsx b/frontend/src/Hero.tsx index 04da8769..644848dc 100644 --- a/frontend/src/Hero.tsx +++ b/frontend/src/Hero.tsx @@ -37,7 +37,7 @@ export default function Hero({