From 1eb168be55fdcd594c2548f9aa45c4617f778fd8 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Sun, 11 Aug 2024 19:33:31 +0530
Subject: [PATCH] vector indexes to be named after mongo _id

---
 application/api/internal/routes.py   | 10 +++++--
 application/api/user/routes.py       | 23 +++++++--------
 application/retriever/classic_rag.py |  9 +-----
 application/worker.py                | 43 +++++++++-------------------
 4 files changed, 34 insertions(+), 51 deletions(-)

diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py
index 6039ecdf..f4203822 100755
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -3,7 +3,7 @@ import datetime
 from flask import Blueprint, request, send_from_directory
 from pymongo import MongoClient
 from werkzeug.utils import secure_filename
-
+from bson.objectid import ObjectId
 from application.core.settings import settings
 
 mongo = MongoClient(settings.MONGO_URI)
@@ -35,7 +35,12 @@ def upload_index_files():
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
     tokens = secure_filename(request.form["tokens"])
-    save_dir = os.path.join(current_dir, "indexes", user, job_name)
+    """
+    ObjectId serves as a dir name in application/indexes,
+    and for indexing the vector metadata in the collection
+    """
+    _id = ObjectId()
+    save_dir = os.path.join(current_dir, "indexes", str(_id))
     if settings.VECTOR_STORE == "faiss":
         if "file_faiss" not in request.files:
             print("No file part")
@@ -58,6 +63,7 @@ def upload_index_files():
     # create entry in vectors_collection
     vectors_collection.insert_one(
         {
+            "_id": _id,
             "user": user,
             "name": job_name,
             "language": job_name,
diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index aab30469..7ce0b2e2 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -116,18 +116,17 @@ def delete_by_ids():
 def delete_old():
     """Delete old indexes."""
     import shutil
-
-    path = request.args.get("path")
-    dirs = path.split("/")
-    dirs_clean = []
-    for i in range(0, len(dirs)):
-        dirs_clean.append(secure_filename(dirs[i]))
-    # check that path strats with indexes or vectors
-
-    if dirs_clean[0] not in ["indexes", "vectors"]:
-        return {"status": "error"}
-    path_clean = "/".join(dirs_clean)
-    vectors_collection.delete_one({"name": dirs_clean[-1], "user": dirs_clean[-2]})
+    name = request.args.get("name")
+    user = request.args.get("user")
+    doc = vectors_collection.find_one({
+        "user": user,
+        "name": name,
+    })
+    print("user", user)
+    print("file", name)
+    if doc is None:
+        return {"status": "not found"}, 404
+    path_clean = doc["location"]
     if settings.VECTOR_STORE == "faiss":
         try:
             shutil.rmtree(os.path.join(current_dir, path_clean))
diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py
index 2b77db34..4a1aa5bc 100644
--- a/application/retriever/classic_rag.py
+++ b/application/retriever/classic_rag.py
@@ -40,14 +40,7 @@ class ClassicRAG(BaseRetriever):
 
     def _get_vectorstore(self, source):
         if "active_docs" in source:
-            if source["active_docs"].split("/")[0] == "default":
-                vectorstore = ""
-            elif source["active_docs"].split("/")[0] == "local":
-                vectorstore = "indexes/" + source["active_docs"]
-            else:
-                vectorstore = "vectors/" + source["active_docs"]
-            if source["active_docs"] == "default":
-                vectorstore = ""
+            vectorstore = "indexes/" + source["active_docs"]
         else:
             vectorstore = ""
         vectorstore = os.path.join("application", vectorstore)
diff --git a/application/worker.py b/application/worker.py
index bd1bc15a..b3258983 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -14,6 +14,7 @@
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
+
 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
     store = "/".join(title.split("/")[1:3])
@@ -25,9 +26,7 @@ def generate_random_string(length):
     return "".join([string.ascii_letters[i % 52] for i in range(length)])
 
 
-current_dir = os.path.dirname(
-    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-)
+current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
@@ -93,9 +92,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     print(full_path, file=sys.stderr)
     # check if API_URL env variable is set
     file_data = {"name": name_job, "file": filename, "user": user}
-    response = requests.get(
-        urljoin(settings.API_URL, "/api/download"), params=file_data
-    )
+    response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
     # check if file is in the response
     print(response, file=sys.stderr)
     file = response.content
@@ -107,9 +104,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # check if file is .zip and extract it
     if filename.endswith(".zip"):
-        extract_zip_recursive(
-            os.path.join(full_path, filename), full_path, 0, recursion_depth
-        )
+        extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)
 
     self.update_state(state="PROGRESS", meta={"current": 1})
 
@@ -141,22 +136,16 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
             "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        response = requests.get(
-            urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&user=" + user))
     else:
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), data=file_data
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
 
     # delete local
     shutil.rmtree(full_path)
@@ -196,17 +185,15 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Proceed with uploading and cleaning as in the original function
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
-
-        requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
+
+        requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&user=" + user))
     else:
         requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
 
@@ -222,9 +209,7 @@ def count_tokens_docs(docs):
     for doc in docs:
         docs_content += doc.page_content
 
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
+    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     return tokens
 
@@ -234,4 +219,4 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
     total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
\ No newline at end of file
+    return num_tokens, total_price