Merge branch 'main' into 1059-migrating-database-to-new-model

2026-02-20 19:31:10 +00:00 · 2024-09-09 23:55:25 +01:00
parent c686d950d0 a1d3592d08
commit 44d225e6ca
64 changed files with 3517 additions and 4971 deletions
--- a/application/worker.py
+++ b/application/worker.py
@@ -2,8 +2,8 @@ import os
 import shutil
 import string
 import zipfile
-import tiktoken
 from urllib.parse import urljoin
+import logging

 import requests
 from bson.objectid import ObjectId
@@ -14,6 +14,8 @@ from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
+from application.utils import count_tokens_docs
+


 # Define a function to extract metadata from a given filename.
@@ -40,7 +42,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
        max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
    """
    if current_depth > max_depth:
-        print(f"Reached maximum recursion depth of {max_depth}")
+        logging.warning(f"Reached maximum recursion depth of {max_depth}")
        return

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -88,14 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=
    max_tokens = 1250
    recursion_depth = 2
    full_path = os.path.join(directory, user, name_job)
-    import sys

-    print(full_path, file=sys.stderr)
+    logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
    # check if API_URL env variable is set
    file_data = {"name": name_job, "file": filename, "user": user}
-    response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
-    # check if file is in the response
-    print(response, file=sys.stderr)
+    response = requests.get(
+        urljoin(settings.API_URL, "/api/download"), params=file_data
+    )
    file = response.content

    if not os.path.exists(full_path):
@@ -134,7 +135,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=

    if sample:
        for i in range(min(5, len(raw_docs))):
-            print(raw_docs[i].text)
+            logging.info(f"Sample document {i}: {raw_docs[i]}")

    # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
    # and send them to the server (provide user and name in form)
@@ -170,6 +171,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    self.update_state(state="PROGRESS", meta={"current": 1})
+    logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, "source_data": source_data})

    remote_loader = RemoteCreator.create_loader(loader)
    raw_docs = remote_loader.load_data(source_data)
@@ -202,23 +204,3 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
    shutil.rmtree(full_path)

    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
-
-
-def count_tokens_docs(docs):
-    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    return tokens
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price