From a353e696486463ac808b299926541e4de1b23939 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 8 Sep 2024 16:59:51 +0100 Subject: [PATCH] feat: new vectors structure --- application/api/internal/routes.py | 21 +++++------ application/api/user/routes.py | 28 +++------------ application/retriever/classic_rag.py | 10 +----- application/vectorstore/elasticsearch.py | 1 - application/vectorstore/faiss.py | 12 ++++++- application/worker.py | 15 +++++--- frontend/src/Navigation.tsx | 4 +-- frontend/src/components/SourceDropdown.tsx | 3 -- .../src/conversation/conversationHandlers.ts | 6 ++-- .../src/modals/ShareConversationModal.tsx | 20 +---------- frontend/src/preferences/preferenceApi.ts | 18 +++------- frontend/src/preferences/preferenceSlice.ts | 8 ++--- frontend/src/settings/index.tsx | 3 +- scripts/migrate_to_v1_vectorstore.py | 35 +++++++++++++++++++ 14 files changed, 85 insertions(+), 99 deletions(-) create mode 100644 scripts/migrate_to_v1_vectorstore.py diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index f4203822..f6eef4c4 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -35,12 +35,12 @@ def upload_index_files(): return {"status": "no name"} job_name = secure_filename(request.form["name"]) tokens = secure_filename(request.form["tokens"]) - """" - ObjectId serves as a dir name in application/indexes, - and for indexing the vector metadata in the collection - """ - _id = ObjectId() - save_dir = os.path.join(current_dir, "indexes", str(_id)) + retriever = secure_filename(request.form["retriever"]) + id = secure_filename(request.form["id"]) + type = secure_filename(request.form["type"]) + remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None + + save_dir = os.path.join(current_dir, "indexes", str(id)) if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -63,15 +63,16 @@ def 
upload_index_files(): # create entry in vectors_collection vectors_collection.insert_one( { - "_id":_id, + "_id": ObjectId(id), "user": user, "name": job_name, "language": job_name, - "location": save_dir, "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "model": settings.EMBEDDINGS_NAME, - "type": "local", - "tokens": tokens + "type": type, + "tokens": tokens, + "retriever": retriever, + "remote_data": remote_data } ) return {"status": "ok"} \ No newline at end of file diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 7c6e979c..43e532e1 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -237,15 +237,11 @@ def combined_json(): data = [ { "name": "default", - "language": "default", - "version": "", - "description": "default", - "fullName": "default", "date": "default", - "docLink": "default", "model": settings.EMBEDDINGS_NAME, "location": "remote", "tokens": "", + "retriever": "classic", } ] # structure: name, language, version, description, fullName, date, docLink @@ -255,35 +251,22 @@ def combined_json(): { "id": str(index["_id"]), "name": index["name"], - "language": index["language"], - "version": "", - "description": index["name"], - "fullName": index["name"], "date": index["date"], - "docLink": index["location"], "model": settings.EMBEDDINGS_NAME, "location": "local", "tokens": index["tokens"] if ("tokens" in index.keys()) else "", + "retriever": index["retriever"] if ("retriever" in index.keys()) else "classic", } ) - if settings.VECTOR_STORE == "faiss": - data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json() - for index in data_remote: - index["location"] = "remote" - data.append(index) if "duckduck_search" in settings.RETRIEVERS_ENABLED: data.append( { "name": "DuckDuckGo Search", - "language": "en", - "version": "", - "description": "duckduck_search", - "fullName": "DuckDuckGo Search", "date": "duckduck_search", - "docLink": "duckduck_search", 
"model": settings.EMBEDDINGS_NAME, "location": "custom", "tokens": "", + "retriever": "duckduck_search", } ) if "brave_search" in settings.RETRIEVERS_ENABLED: @@ -291,14 +274,11 @@ def combined_json(): { "name": "Brave Search", "language": "en", - "version": "", - "description": "brave_search", - "fullName": "Brave Search", "date": "brave_search", - "docLink": "brave_search", "model": settings.EMBEDDINGS_NAME, "location": "custom", "tokens": "", + "retriever": "brave_search", } ) diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py index 4a1aa5bc..810bb179 100644 --- a/application/retriever/classic_rag.py +++ b/application/retriever/classic_rag.py @@ -21,7 +21,7 @@ class ClassicRAG(BaseRetriever): user_api_key=None, ): self.question = question - self.vectorstore = self._get_vectorstore(source=source) + self.vectorstore = source['active_docs'] if 'active_docs' in source else None self.chat_history = chat_history self.prompt = prompt self.chunks = chunks @@ -38,14 +38,6 @@ class ClassicRAG(BaseRetriever): ) self.user_api_key = user_api_key - def _get_vectorstore(self, source): - if "active_docs" in source: - vectorstore = "indexes/"+source["active_docs"] - else: - vectorstore = "" - vectorstore = os.path.join("application", vectorstore) - return vectorstore - def _get_data(self): if self.chunks == 0: docs = [] diff --git a/application/vectorstore/elasticsearch.py b/application/vectorstore/elasticsearch.py index bb28d5ce..061292b0 100644 --- a/application/vectorstore/elasticsearch.py +++ b/application/vectorstore/elasticsearch.py @@ -210,4 +210,3 @@ class ElasticsearchStore(BaseVectorStore): def delete_index(self): self._es_connection.delete_by_query(index=self.index_name, query={"match": { "metadata.store.keyword": self.path}},) - diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 8e8f3b8e..957e61ef 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,12 
+1,22 @@ from langchain_community.vectorstores import FAISS from application.vectorstore.base import BaseVectorStore from application.core.settings import settings +import os + +def get_vectorstore(path): + if path: + vectorstore = "indexes/"+path + vectorstore = os.path.join("application", vectorstore) + else: + vectorstore = os.path.join("application") + + return vectorstore class FaissStore(BaseVectorStore): def __init__(self, path, embeddings_key, docs_init=None): super().__init__() - self.path = path + self.path = get_vectorstore(path) embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) if docs_init: self.docsearch = FAISS.from_documents( diff --git a/application/worker.py b/application/worker.py index b3258983..852d9785 100755 --- a/application/worker.py +++ b/application/worker.py @@ -6,6 +6,7 @@ import tiktoken from urllib.parse import urljoin import requests +from bson.objectid import ObjectId from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader @@ -57,7 +58,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): # Define the main function for ingesting and processing documents. -def ingest_worker(self, directory, formats, name_job, filename, user): +def ingest_worker(self, directory, formats, name_job, filename, user, retriever="classic"): """ Ingest and process documents. @@ -68,6 +69,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): name_job (str): Name of the job for this ingestion task. filename (str): Name of the file to be ingested. user (str): Identifier for the user initiating the ingestion. + retriever (str): Type of retriever to use for processing the documents. Returns: dict: Information about the completed ingestion task, including input parameters and a "limited" flag. 
@@ -136,7 +138,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - file_data = {"name": name_job, "user": user, "tokens": tokens} + id = ObjectId() + file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), 'type': 'local'} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), @@ -160,7 +163,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): } -def remote_worker(self, source_data, name_job, user, loader, directory="temp"): +def remote_worker(self, source_data, name_job, user, loader, directory="temp", retriever="classic"): token_check = True min_tokens = 150 max_tokens = 1250 @@ -180,12 +183,14 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): token_check=token_check, ) # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - call_openai_api(docs, full_path, self) tokens = count_tokens_docs(docs) + call_openai_api(docs, full_path, self) self.update_state(state="PROGRESS", meta={"current": 100}) # Proceed with uploading and cleaning as in the original function - file_data = {"name": name_job, "user": user, "tokens": tokens} + id = ObjectId() + file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, + "id": str(id), 'type': loader, 'remote_data': source_data} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), diff --git a/frontend/src/Navigation.tsx b/frontend/src/Navigation.tsx index cbfe5d95..b67d874b 100644 --- a/frontend/src/Navigation.tsx +++ b/frontend/src/Navigation.tsx @@ -124,10 +124,8 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { }; const handleDeleteClick = (doc: Doc) => { - const docPath = `indexes/local/${doc.name}`; 
- userService - .deletePath(docPath) + .deletePath(doc.id ?? '') .then(() => { return getDocs(); }) diff --git a/frontend/src/components/SourceDropdown.tsx b/frontend/src/components/SourceDropdown.tsx index ce130b4d..e76b9664 100644 --- a/frontend/src/components/SourceDropdown.tsx +++ b/frontend/src/components/SourceDropdown.tsx @@ -63,9 +63,6 @@ function SourceDropdown({

{selectedDocs?.name || 'None'}

-

- {selectedDocs?.version} -

{ @@ -87,7 +87,7 @@ export function handleFetchAnswerSteaming( }; if (selectedDocs && 'id' in selectedDocs) payload.active_docs = selectedDocs.id as string; - else payload.retriever = selectedDocs?.docLink as string; + payload.retriever = selectedDocs?.retriever as string; return new Promise((resolve, reject) => { conversationService @@ -160,7 +160,7 @@ export function handleSearch( }; if (selectedDocs && 'id' in selectedDocs) payload.active_docs = selectedDocs.id as string; - else payload.retriever = selectedDocs?.docLink as string; + payload.retriever = selectedDocs?.retriever as string; return conversationService .search(payload) .then((response) => response.json()) diff --git a/frontend/src/modals/ShareConversationModal.tsx b/frontend/src/modals/ShareConversationModal.tsx index c7ef0ad6..fbb49468 100644 --- a/frontend/src/modals/ShareConversationModal.tsx +++ b/frontend/src/modals/ShareConversationModal.tsx @@ -46,27 +46,9 @@ export const ShareConversationModal = ({ ? docs .filter((doc) => doc.model === embeddingsName) .map((doc: Doc) => { - let namePath = doc.name; - if (doc.language === namePath) { - namePath = '.project'; - } - let docPath = 'default'; - if (doc.location === 'local') { - docPath = 'local' + '/' + doc.name + '/'; - } else if (doc.location === 'remote') { - docPath = - doc.language + - '/' + - namePath + - '/' + - doc.version + - '/' + - doc.model + - '/'; - } return { label: doc.name, - value: docPath, + value: doc.id ?? 'default', }; }) : []; diff --git a/frontend/src/preferences/preferenceApi.ts b/frontend/src/preferences/preferenceApi.ts index 29a41645..96f87e1d 100644 --- a/frontend/src/preferences/preferenceApi.ts +++ b/frontend/src/preferences/preferenceApi.ts @@ -3,15 +3,12 @@ import userService from '../api/services/userService'; // not all properties in Doc are going to be present. 
Make some optional export type Doc = { - location: string; + id: string | null; name: string; - language: string; - version: string; - description: string; - fullName: string; + type: string; date: string; - docLink: string; model: string; + retriever: string; }; //Fetches all JSON objects from the source. We only use the objects with the "model" property in SelectDocsModal.tsx. Hopefully can clean up the source file later. @@ -78,17 +75,10 @@ export function setLocalPrompt(prompt: string): void { export function setLocalRecentDocs(doc: Doc): void { localStorage.setItem('DocsGPTRecentDocs', JSON.stringify(doc)); - let namePath = doc.name; - if (doc.language === namePath) { - namePath = '.project'; - } let docPath = 'default'; - if (doc.location === 'local') { + if (doc.type === 'local') { docPath = 'local' + '/' + doc.name + '/'; - } else if (doc.location === 'remote') { - docPath = - doc.language + '/' + namePath + '/' + doc.version + '/' + doc.model + '/'; } userService .checkDocs({ diff --git a/frontend/src/preferences/preferenceSlice.ts b/frontend/src/preferences/preferenceSlice.ts index 370f260e..45e55d3f 100644 --- a/frontend/src/preferences/preferenceSlice.ts +++ b/frontend/src/preferences/preferenceSlice.ts @@ -25,15 +25,13 @@ const initialState: Preference = { chunks: '2', token_limit: 2000, selectedDocs: { + id: 'default', name: 'default', - language: 'default', - location: 'default', - version: 'default', - description: 'default', - fullName: 'default', + type: 'remote', date: 'default', docLink: 'default', model: 'openai_text-embedding-ada-002', + retriever: 'classic', } as Doc, sourceDocs: null, conversations: null, diff --git a/frontend/src/settings/index.tsx b/frontend/src/settings/index.tsx index 226ebb3b..141bd227 100644 --- a/frontend/src/settings/index.tsx +++ b/frontend/src/settings/index.tsx @@ -35,9 +35,8 @@ export default function Settings() { }; const handleDeleteClick = (index: number, doc: Doc) => { - const docPath = 'indexes/' + 'local' + 
'/' + doc.name; userService - .deletePath(docPath) + .deletePath(doc.id ?? '') .then((response) => { if (response.ok && documents) { const updatedDocuments = [ diff --git a/scripts/migrate_to_v1_vectorstore.py b/scripts/migrate_to_v1_vectorstore.py new file mode 100644 index 00000000..5255d222 --- /dev/null +++ b/scripts/migrate_to_v1_vectorstore.py @@ -0,0 +1,35 @@ +import pymongo +import os + +def migrate_to_v1_vectorstore_mongo(): + client = pymongo.MongoClient("mongodb://localhost:27017/") + db = client["docsgpt"] + vectors_collection = db["vectors"] + + for vector in vectors_collection.find(): + if "location" in vector: + del vector["location"] + if "retriever" not in vector: + vector["retriever"] = "classic" + vector["remote_data"] = None + vectors_collection.update_one({"_id": vector["_id"]}, {"$set": {k: v for k, v in vector.items() if k != "_id"}, "$unset": {"location": ""}}) + + client.close() + +def migrate_faiss_to_v1_vectorstore(): + client = pymongo.MongoClient("mongodb://localhost:27017/") + db = client["docsgpt"] + vectors_collection = db["vectors"] + + for vector in vectors_collection.find(): + old_path = f"./application/indexes/{vector['user']}/{vector['name']}" + new_path = f"./application/indexes/{vector['_id']}" + try: + os.rename(old_path, new_path) + except OSError as e: + print(f"Error moving {old_path} to {new_path}: {e}") + + client.close() + +migrate_faiss_to_v1_vectorstore() +migrate_to_v1_vectorstore_mongo() \ No newline at end of file