From 1eb168be55fdcd594c2548f9aa45c4617f778fd8 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Sun, 11 Aug 2024 19:33:31 +0530
Subject: [PATCH] vector indexes to be named after mongo _id

---
 application/api/internal/routes.py   | 10 +++++--
 application/api/user/routes.py       | 23 +++++++--------
 application/retriever/classic_rag.py |  9 +-----
 application/worker.py                | 43 +++++++++-------------------
 4 files changed, 34 insertions(+), 51 deletions(-)

diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py
index 6039ecdf..f4203822 100755
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -3,7 +3,7 @@ import datetime
 from flask import Blueprint, request, send_from_directory
 from pymongo import MongoClient
 from werkzeug.utils import secure_filename
-
+from bson.objectid import ObjectId
 from application.core.settings import settings
 
 mongo = MongoClient(settings.MONGO_URI)
@@ -35,7 +35,12 @@ def upload_index_files():
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
     tokens = secure_filename(request.form["tokens"])
-    save_dir = os.path.join(current_dir, "indexes", user, job_name)
+    """
+    ObjectId serves as a dir name in application/indexes,
+    and for indexing the vector metadata in the collection
+    """
+    _id = ObjectId()
+    save_dir = os.path.join(current_dir, "indexes", str(_id))
     if settings.VECTOR_STORE == "faiss":
         if "file_faiss" not in request.files:
             print("No file part")
@@ -58,6 +63,7 @@ def upload_index_files():
     # create entry in vectors_collection
     vectors_collection.insert_one(
         {
+            "_id": _id,
             "user": user,
             "name": job_name,
             "language": job_name,
diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index aab30469..7ce0b2e2 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -116,18 +116,17 @@ def delete_by_ids():
 def delete_old():
     """Delete old indexes."""
     import shutil
-
-    path = request.args.get("path")
-    dirs = path.split("/")
-    dirs_clean = []
-    for i in range(0, len(dirs)):
-        dirs_clean.append(secure_filename(dirs[i]))
-    # check that path strats with indexes or vectors
-
-    if dirs_clean[0] not in ["indexes", "vectors"]:
-        return {"status": "error"}
-    path_clean = "/".join(dirs_clean)
-    vectors_collection.delete_one({"name": dirs_clean[-1], "user": dirs_clean[-2]})
+    name = request.args.get("name")
+    user = request.args.get("user")
+    doc = vectors_collection.find_one({
+        "user": user,
+        "name": name,
+    })
+    print("user", user)
+    print("file", name)
+    if doc is None:
+        return {"status": "not found"}, 404
+    path_clean = doc["location"]
     if settings.VECTOR_STORE == "faiss":
         try:
             shutil.rmtree(os.path.join(current_dir, path_clean))
diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py
index 2b77db34..4a1aa5bc 100644
--- a/application/retriever/classic_rag.py
+++ b/application/retriever/classic_rag.py
@@ -40,14 +40,7 @@ class ClassicRAG(BaseRetriever):
 
     def _get_vectorstore(self, source):
         if "active_docs" in source:
-            if source["active_docs"].split("/")[0] == "default":
-                vectorstore = ""
-            elif source["active_docs"].split("/")[0] == "local":
-                vectorstore = "indexes/" + source["active_docs"]
-            else:
-                vectorstore = "vectors/" + source["active_docs"]
-            if source["active_docs"] == "default":
-                vectorstore = ""
+            vectorstore = "indexes/" + source["active_docs"]
         else:
             vectorstore = ""
         vectorstore = os.path.join("application", vectorstore)
diff --git a/application/worker.py b/application/worker.py
index bd1bc15a..b3258983 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -14,6 +14,7 @@
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
+
 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
     store = "/".join(title.split("/")[1:3])
@@ -25,9 +26,7 @@ def generate_random_string(length):
     return "".join([string.ascii_letters[i % 52] for i in range(length)])
 
 
-current_dir = os.path.dirname(
-    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-)
+current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
@@ -93,9 +92,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     print(full_path, file=sys.stderr)
     # check if API_URL env variable is set
     file_data = {"name": name_job, "file": filename, "user": user}
-    response = requests.get(
-        urljoin(settings.API_URL, "/api/download"), params=file_data
-    )
+    response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
     # check if file is in the response
     print(response, file=sys.stderr)
     file = response.content
@@ -107,9 +104,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # check if file is .zip and extract it
     if filename.endswith(".zip"):
-        extract_zip_recursive(
-            os.path.join(full_path, filename), full_path, 0, recursion_depth
-        )
+        extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)
 
     self.update_state(state="PROGRESS", meta={"current": 1})
 
@@ -141,22 +136,16 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
             "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        response = requests.get(
-            urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&user=" + user))
     else:
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), data=file_data
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
 
     # delete local
     shutil.rmtree(full_path)
@@ -196,17 +185,15 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Proceed with uploading and cleaning as in the original function
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
-
-        requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
+
+        requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&user=" + user))
     else:
         requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
 
@@ -222,9 +209,7 @@ def count_tokens_docs(docs):
     for doc in docs:
         docs_content += doc.page_content
 
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
+    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     return tokens
 
@@ -234,4 +219,4 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
     total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
\ No newline at end of file
+    return num_tokens, total_price