Merge branch 'main' into 1059-migrating-database-to-new-model

This commit is contained in:
Alex
2024-09-09 23:55:25 +01:00
64 changed files with 3517 additions and 4971 deletions

View File

@@ -2,8 +2,8 @@ import os
import shutil
import string
import zipfile
import tiktoken
from urllib.parse import urljoin
import logging
import requests
from bson.objectid import ObjectId
@@ -14,6 +14,8 @@ from application.parser.remote.remote_creator import RemoteCreator
from application.parser.open_ai_func import call_openai_api
from application.parser.schema.base import Document
from application.parser.token_func import group_split
from application.utils import count_tokens_docs
# Define a function to extract metadata from a given filename.
@@ -40,7 +42,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
"""
if current_depth > max_depth:
print(f"Reached maximum recursion depth of {max_depth}")
logging.warning(f"Reached maximum recursion depth of {max_depth}")
return
with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -88,14 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=
max_tokens = 1250
recursion_depth = 2
full_path = os.path.join(directory, user, name_job)
import sys
print(full_path, file=sys.stderr)
logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
# check if API_URL env variable is set
file_data = {"name": name_job, "file": filename, "user": user}
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
# check if file is in the response
print(response, file=sys.stderr)
response = requests.get(
urljoin(settings.API_URL, "/api/download"), params=file_data
)
file = response.content
if not os.path.exists(full_path):
@@ -134,7 +135,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
logging.info(f"Sample document {i}: {raw_docs[i]}")
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
@@ -170,6 +171,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
if not os.path.exists(full_path):
os.makedirs(full_path)
self.update_state(state="PROGRESS", meta={"current": 1})
logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, source_data: source_data})
remote_loader = RemoteCreator.create_loader(loader)
raw_docs = remote_loader.load_data(source_data)
@@ -202,23 +204,3 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
shutil.rmtree(full_path)
return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
def count_tokens_docs(docs):
    """Return the number of tokens in the combined text of ``docs``.

    The ``page_content`` of every document is concatenated without any
    separator and the resulting string is tokenized with the
    ``cl100k_base`` encoding.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute.

    Returns:
        The token count reported by ``num_tokens_from_string``.
    """
    # str.join assembles the text in a single pass instead of re-copying the
    # accumulated string on every ``+=``.
    docs_content = "".join(doc.page_content for doc in docs)
    # The helper also returns a price estimate; only the count is needed here.
    tokens, _ = num_tokens_from_string(
        string=docs_content, encoding_name="cl100k_base"
    )
    return tokens
def num_tokens_from_string(string: str, encoding_name: str) -> "tuple[int, float]":
    """Count the tokens in ``string`` and estimate the matching price.

    Args:
        string: Text to tokenize. The parameter name is kept for
            keyword-argument callers, even though it hides the ``string``
            module inside this function.
        encoding_name: Name of the tiktoken encoding to use, for example
            ``"cl100k_base"``.

    Returns:
        A ``(num_tokens, total_price)`` pair, where ``total_price`` is
        computed at a rate of 0.0004 per 1000 tokens.
    """
    # The return annotation is quoted so it is never evaluated at definition
    # time and therefore works on interpreters without builtin generics.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price