Name vector indexes after the Mongo _id

2026-05-03 23:36:38 +00:00 · 2024-08-11 19:33:31 +05:30
parent 3c6fd365fb
commit 1eb168be55
4 changed files with 34 additions and 51 deletions
--- a/application/worker.py
+++ b/application/worker.py
@@ -14,6 +14,7 @@ from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split

+
 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
    store = "/".join(title.split("/")[1:3])
@@ -25,9 +26,7 @@ def generate_random_string(length):
    return "".join([string.ascii_letters[i % 52] for i in range(length)])


-current_dir = os.path.dirname(
-    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-)
+current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


 def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
@@ -93,9 +92,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    print(full_path, file=sys.stderr)
    # check if API_URL env variable is set
    file_data = {"name": name_job, "file": filename, "user": user}
-    response = requests.get(
-        urljoin(settings.API_URL, "/api/download"), params=file_data
-    )
+    response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
    # check if file is in the response
    print(response, file=sys.stderr)
    file = response.content
@@ -107,9 +104,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):

    # check if file is .zip and extract it
    if filename.endswith(".zip"):
-        extract_zip_recursive(
-            os.path.join(full_path, filename), full_path, 0, recursion_depth
-        )
+        extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)

    self.update_state(state="PROGRESS", meta={"current": 1})

@@ -141,22 +136,16 @@ def ingest_worker(self, directory, formats, name_job, filename, user):

    # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
    # and send them to the server (provide user and name in form)
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
        }
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        response = requests.get(
-            urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
    else:
-        response = requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), data=file_data
-        )
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)

    # delete local
    shutil.rmtree(full_path)
@@ -196,17 +185,15 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
    self.update_state(state="PROGRESS", meta={"current": 100})

    # Proceed with uploading and cleaning as in the original function
-    file_data = {"name": name_job, "user": user, "tokens":tokens}
+    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
        }
-        
-        requests.post(
-            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
-        )
-        requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
+
+        requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
    else:
        requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)

@@ -222,9 +209,7 @@ def count_tokens_docs(docs):
    for doc in docs:
        docs_content += doc.page_content

-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
+    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
    return tokens

@@ -234,4 +219,4 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
+    return num_tokens, total_price