From a353e696486463ac808b299926541e4de1b23939 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 8 Sep 2024 16:59:51 +0100 Subject: [PATCH] feat: new vectors structure --- application/api/internal/routes.py | 21 +++++------ application/api/user/routes.py | 28 +++------------ application/retriever/classic_rag.py | 10 +----- application/vectorstore/elasticsearch.py | 1 - application/vectorstore/faiss.py | 12 ++++++- application/worker.py | 15 +++++--- frontend/src/Navigation.tsx | 4 +-- frontend/src/components/SourceDropdown.tsx | 3 -- .../src/conversation/conversationHandlers.ts | 6 ++-- .../src/modals/ShareConversationModal.tsx | 20 +---------- frontend/src/preferences/preferenceApi.ts | 18 +++------- frontend/src/preferences/preferenceSlice.ts | 8 ++--- frontend/src/settings/index.tsx | 3 +- scripts/migrate_to_v1_vectorstore.py | 35 +++++++++++++++++++ 14 files changed, 85 insertions(+), 99 deletions(-) create mode 100644 scripts/migrate_to_v1_vectorstore.py diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index f4203822..f6eef4c4 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -35,12 +35,12 @@ def upload_index_files(): return {"status": "no name"} job_name = secure_filename(request.form["name"]) tokens = secure_filename(request.form["tokens"]) - """" - ObjectId serves as a dir name in application/indexes, - and for indexing the vector metadata in the collection - """ - _id = ObjectId() - save_dir = os.path.join(current_dir, "indexes", str(_id)) + retriever = secure_filename(request.form["retriever"]) + id = secure_filename(request.form["id"]) + type = secure_filename(request.form["type"]) + remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None + + save_dir = os.path.join(current_dir, "indexes", str(id)) if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -63,15 +63,16 @@ def 
upload_index_files(): # create entry in vectors_collection vectors_collection.insert_one( { - "_id":_id, + "_id": ObjectId(id), "user": user, "name": job_name, "language": job_name, - "location": save_dir, "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "model": settings.EMBEDDINGS_NAME, - "type": "local", - "tokens": tokens + "type": type, + "tokens": tokens, + "retriever": retriever, + "remote_data": remote_data } ) return {"status": "ok"} \ No newline at end of file diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 7c6e979c..43e532e1 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -237,15 +237,11 @@ def combined_json(): data = [ { "name": "default", - "language": "default", - "version": "", - "description": "default", - "fullName": "default", "date": "default", - "docLink": "default", "model": settings.EMBEDDINGS_NAME, "location": "remote", "tokens": "", + "retriever": "classic", } ] # structure: name, language, version, description, fullName, date, docLink @@ -255,35 +251,22 @@ def combined_json(): { "id": str(index["_id"]), "name": index["name"], - "language": index["language"], - "version": "", - "description": index["name"], - "fullName": index["name"], "date": index["date"], - "docLink": index["location"], "model": settings.EMBEDDINGS_NAME, "location": "local", "tokens": index["tokens"] if ("tokens" in index.keys()) else "", + "retriever": index["retriever"] if ("retriever" in index.keys()) else "classic", } ) - if settings.VECTOR_STORE == "faiss": - data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json() - for index in data_remote: - index["location"] = "remote" - data.append(index) if "duckduck_search" in settings.RETRIEVERS_ENABLED: data.append( { "name": "DuckDuckGo Search", - "language": "en", - "version": "", - "description": "duckduck_search", - "fullName": "DuckDuckGo Search", "date": "duckduck_search", - "docLink": "duckduck_search", 
"model": settings.EMBEDDINGS_NAME, "location": "custom", "tokens": "", + "retriever": "duckduck_search", } ) if "brave_search" in settings.RETRIEVERS_ENABLED: @@ -291,14 +274,11 @@ def combined_json(): { "name": "Brave Search", "language": "en", - "version": "", - "description": "brave_search", - "fullName": "Brave Search", "date": "brave_search", - "docLink": "brave_search", "model": settings.EMBEDDINGS_NAME, "location": "custom", "tokens": "", + "retriever": "brave_search", } ) diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py index 4a1aa5bc..810bb179 100644 --- a/application/retriever/classic_rag.py +++ b/application/retriever/classic_rag.py @@ -21,7 +21,7 @@ class ClassicRAG(BaseRetriever): user_api_key=None, ): self.question = question - self.vectorstore = self._get_vectorstore(source=source) + self.vectorstore = source['active_docs'] if 'active_docs' in source else None self.chat_history = chat_history self.prompt = prompt self.chunks = chunks @@ -38,14 +38,6 @@ class ClassicRAG(BaseRetriever): ) self.user_api_key = user_api_key - def _get_vectorstore(self, source): - if "active_docs" in source: - vectorstore = "indexes/"+source["active_docs"] - else: - vectorstore = "" - vectorstore = os.path.join("application", vectorstore) - return vectorstore - def _get_data(self): if self.chunks == 0: docs = [] diff --git a/application/vectorstore/elasticsearch.py b/application/vectorstore/elasticsearch.py index bb28d5ce..061292b0 100644 --- a/application/vectorstore/elasticsearch.py +++ b/application/vectorstore/elasticsearch.py @@ -210,4 +210,3 @@ class ElasticsearchStore(BaseVectorStore): def delete_index(self): self._es_connection.delete_by_query(index=self.index_name, query={"match": { "metadata.store.keyword": self.path}},) - diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 8e8f3b8e..957e61ef 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,12 
+1,22 @@ from langchain_community.vectorstores import FAISS from application.vectorstore.base import BaseVectorStore from application.core.settings import settings +import os + +def get_vectorstore(path): + if path: + vectorstore = "indexes/"+path + vectorstore = os.path.join("application", vectorstore) + else: + vectorstore = os.path.join("application") + + return vectorstore class FaissStore(BaseVectorStore): def __init__(self, path, embeddings_key, docs_init=None): super().__init__() - self.path = path + self.path = get_vectorstore(path) embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) if docs_init: self.docsearch = FAISS.from_documents( diff --git a/application/worker.py b/application/worker.py index b3258983..852d9785 100755 --- a/application/worker.py +++ b/application/worker.py @@ -6,6 +6,7 @@ import tiktoken from urllib.parse import urljoin import requests +from bson.objectid import ObjectId from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader @@ -57,7 +58,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): # Define the main function for ingesting and processing documents. -def ingest_worker(self, directory, formats, name_job, filename, user): +def ingest_worker(self, directory, formats, name_job, filename, user, retriever="classic"): """ Ingest and process documents. @@ -68,6 +69,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): name_job (str): Name of the job for this ingestion task. filename (str): Name of the file to be ingested. user (str): Identifier for the user initiating the ingestion. + retriever (str): Type of retriever to use for processing the documents. Returns: dict: Information about the completed ingestion task, including input parameters and a "limited" flag. 
@@ -136,7 +138,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - file_data = {"name": name_job, "user": user, "tokens": tokens} + id = ObjectId() + file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), 'type': 'local'} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), @@ -160,7 +163,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): } -def remote_worker(self, source_data, name_job, user, loader, directory="temp"): +def remote_worker(self, source_data, name_job, user, loader, directory="temp", retriever="classic"): token_check = True min_tokens = 150 max_tokens = 1250 @@ -180,12 +183,14 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): token_check=token_check, ) # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - call_openai_api(docs, full_path, self) tokens = count_tokens_docs(docs) + call_openai_api(docs, full_path, self) self.update_state(state="PROGRESS", meta={"current": 100}) # Proceed with uploading and cleaning as in the original function - file_data = {"name": name_job, "user": user, "tokens": tokens} + id = ObjectId() + file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, + "id": str(id), 'type': loader, 'remote_data': source_data} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), diff --git a/frontend/src/Navigation.tsx b/frontend/src/Navigation.tsx index cbfe5d95..b67d874b 100644 --- a/frontend/src/Navigation.tsx +++ b/frontend/src/Navigation.tsx @@ -124,10 +124,8 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { }; const handleDeleteClick = (doc: Doc) => { - const docPath = `indexes/local/${doc.name}`; 
- userService - .deletePath(docPath) + .deletePath(doc.id ?? '') .then(() => { return getDocs(); }) diff --git a/frontend/src/components/SourceDropdown.tsx b/frontend/src/components/SourceDropdown.tsx index ce130b4d..e76b9664 100644 --- a/frontend/src/components/SourceDropdown.tsx +++ b/frontend/src/components/SourceDropdown.tsx @@ -63,9 +63,6 @@ function SourceDropdown({

{selectedDocs?.name || 'None'}

-

- {selectedDocs?.version} -

{ @@ -87,7 +87,7 @@ export function handleFetchAnswerSteaming( }; if (selectedDocs && 'id' in selectedDocs) payload.active_docs = selectedDocs.id as string; - else payload.retriever = selectedDocs?.docLink as string; + payload.retriever = selectedDocs?.retriever as string; return new Promise((resolve, reject) => { conversationService @@ -160,7 +160,7 @@ export function handleSearch( }; if (selectedDocs && 'id' in selectedDocs) payload.active_docs = selectedDocs.id as string; - else payload.retriever = selectedDocs?.docLink as string; + payload.retriever = selectedDocs?.retriever as string; return conversationService .search(payload) .then((response) => response.json()) diff --git a/frontend/src/modals/ShareConversationModal.tsx b/frontend/src/modals/ShareConversationModal.tsx index c7ef0ad6..fbb49468 100644 --- a/frontend/src/modals/ShareConversationModal.tsx +++ b/frontend/src/modals/ShareConversationModal.tsx @@ -46,27 +46,9 @@ export const ShareConversationModal = ({ ? docs .filter((doc) => doc.model === embeddingsName) .map((doc: Doc) => { - let namePath = doc.name; - if (doc.language === namePath) { - namePath = '.project'; - } - let docPath = 'default'; - if (doc.location === 'local') { - docPath = 'local' + '/' + doc.name + '/'; - } else if (doc.location === 'remote') { - docPath = - doc.language + - '/' + - namePath + - '/' + - doc.version + - '/' + - doc.model + - '/'; - } return { label: doc.name, - value: docPath, + value: doc.id ?? 'default', }; }) : []; diff --git a/frontend/src/preferences/preferenceApi.ts b/frontend/src/preferences/preferenceApi.ts index 29a41645..96f87e1d 100644 --- a/frontend/src/preferences/preferenceApi.ts +++ b/frontend/src/preferences/preferenceApi.ts @@ -3,15 +3,12 @@ import userService from '../api/services/userService'; // not all properties in Doc are going to be present. 
Make some optional export type Doc = { - location: string; + id: string | null; name: string; - language: string; - version: string; - description: string; - fullName: string; + type: string; date: string; - docLink: string; model: string; + retriever: string; }; //Fetches all JSON objects from the source. We only use the objects with the "model" property in SelectDocsModal.tsx. Hopefully can clean up the source file later. @@ -78,17 +75,10 @@ export function setLocalPrompt(prompt: string): void { export function setLocalRecentDocs(doc: Doc): void { localStorage.setItem('DocsGPTRecentDocs', JSON.stringify(doc)); - let namePath = doc.name; - if (doc.language === namePath) { - namePath = '.project'; - } let docPath = 'default'; - if (doc.location === 'local') { + if (doc.type === 'local') { docPath = 'local' + '/' + doc.name + '/'; - } else if (doc.location === 'remote') { - docPath = - doc.language + '/' + namePath + '/' + doc.version + '/' + doc.model + '/'; } userService .checkDocs({ diff --git a/frontend/src/preferences/preferenceSlice.ts b/frontend/src/preferences/preferenceSlice.ts index 370f260e..45e55d3f 100644 --- a/frontend/src/preferences/preferenceSlice.ts +++ b/frontend/src/preferences/preferenceSlice.ts @@ -25,15 +25,13 @@ const initialState: Preference = { chunks: '2', token_limit: 2000, selectedDocs: { + id: 'default', name: 'default', - language: 'default', - location: 'default', - version: 'default', - description: 'default', - fullName: 'default', + type: 'remote', date: 'default', docLink: 'default', model: 'openai_text-embedding-ada-002', + retriever: 'classic', } as Doc, sourceDocs: null, conversations: null, diff --git a/frontend/src/settings/index.tsx b/frontend/src/settings/index.tsx index 226ebb3b..141bd227 100644 --- a/frontend/src/settings/index.tsx +++ b/frontend/src/settings/index.tsx @@ -35,9 +35,8 @@ export default function Settings() { }; const handleDeleteClick = (index: number, doc: Doc) => { - const docPath = 'indexes/' + 'local' + 
'/' + doc.name; userService - .deletePath(docPath) + .deletePath(doc.id ?? '') .then((response) => { if (response.ok && documents) { const updatedDocuments = [ diff --git a/scripts/migrate_to_v1_vectorstore.py b/scripts/migrate_to_v1_vectorstore.py new file mode 100644 index 00000000..5255d222 --- /dev/null +++ b/scripts/migrate_to_v1_vectorstore.py @@ -0,0 +1,35 @@ +import pymongo +import os + +def migrate_to_v1_vectorstore_mongo(): + client = pymongo.MongoClient("mongodb://localhost:27017/") + db = client["docsgpt"] + vectors_collection = db["vectors"] + + for vector in vectors_collection.find(): + if "location" in vector: + del vector["location"] + if "retriever" not in vector: + vector["retriever"] = "classic" + vector["remote_data"] = None + vectors_collection.update_one({"_id": vector["_id"]}, {"$set": {k: v for k, v in vector.items() if k != "_id"}, "$unset": {"location": ""}}) + + client.close() + +def migrate_faiss_to_v1_vectorstore(): + client = pymongo.MongoClient("mongodb://localhost:27017/") + db = client["docsgpt"] + vectors_collection = db["vectors"] + + for vector in vectors_collection.find(): + old_path = f"./application/indexes/{vector['user']}/{vector['name']}" + new_path = f"./application/indexes/{vector['_id']}" + try: + os.rename(old_path, new_path) + except OSError as e: + print(f"Error moving {old_path} to {new_path}: {e}") + + client.close() + +migrate_faiss_to_v1_vectorstore() +migrate_to_v1_vectorstore_mongo() \ No newline at end of file