diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py old mode 100644 new mode 100755 index e8a1b80b..6039ecdf --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -34,6 +34,7 @@ def upload_index_files(): if "name" not in request.form: return {"status": "no name"} job_name = secure_filename(request.form["name"]) + tokens = secure_filename(request.form.get("tokens", "")) save_dir = os.path.join(current_dir, "indexes", user, job_name) if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: @@ -64,6 +65,7 @@ def upload_index_files(): "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "model": settings.EMBEDDINGS_NAME, "type": "local", + "tokens": tokens } ) return {"status": "ok"} \ No newline at end of file diff --git a/application/api/user/routes.py b/application/api/user/routes.py index f61b4ad4..51101492 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -253,6 +253,7 @@ def combined_json(): "docLink": "default", "model": settings.EMBEDDINGS_NAME, "location": "remote", + "tokens":"" } ] # structure: name, language, version, description, fullName, date, docLink @@ -269,6 +270,7 @@ def combined_json(): "docLink": index["location"], "model": settings.EMBEDDINGS_NAME, "location": "local", + "tokens": index.get("tokens", "") } ) if settings.VECTOR_STORE == "faiss": @@ -290,6 +292,7 @@ def combined_json(): "docLink": "duckduck_search", "model": settings.EMBEDDINGS_NAME, "location": "custom", + "tokens":"" } ) if "brave_search" in settings.RETRIEVERS_ENABLED: @@ -304,6 +307,7 @@ def combined_json(): "docLink": "brave_search", "model": settings.EMBEDDINGS_NAME, "location": "custom", + "tokens":"" } ) diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py old mode 100644 new mode 100755 index 6a67c538..c58e8059 --- a/application/parser/open_ai_func.py +++ b/application/parser/open_ai_func.py @@ 
-1,6 +1,5 @@ import os -import tiktoken from application.vectorstore.vector_creator import VectorCreator from application.core.settings import settings from retry import retry @@ -11,14 +10,6 @@ from retry import retry # from langchain_community.embeddings import CohereEmbeddings -def num_tokens_from_string(string: str, encoding_name: str) -> int: - # Function to convert string to tokens and estimate user cost. - encoding = tiktoken.get_encoding(encoding_name) - num_tokens = len(encoding.encode(string)) - total_price = (num_tokens / 1000) * 0.0004 - return num_tokens, total_price - - @retry(tries=10, delay=60) def store_add_texts_with_retry(store, i): store.add_texts([i.page_content], metadatas=[i.metadata]) @@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status): store.save_local(f"{folder_name}") -def get_user_permission(docs, folder_name): - # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. - # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. - # docs_content = (" ".join(docs)) - docs_content = "" - for doc in docs: - docs_content += doc.page_content - - tokens, total_price = num_tokens_from_string( - string=docs_content, encoding_name="cl100k_base" - ) - # Here we print the number of tokens and the approx user cost with some visually appealing formatting. - print(f"Number of Tokens = {format(tokens, ',d')}") - print(f"Approx Cost = ${format(total_price, ',.2f')}") - # Here we check for user permission before calling the API. - user_input = input("Price Okay? (Y/N) \n").lower() - if user_input == "y": - call_openai_api(docs, folder_name) - elif user_input == "": - call_openai_api(docs, folder_name) - else: - print("The API was not called. 
No money was spent.") diff --git a/application/worker.py b/application/worker.py old mode 100644 new mode 100755 index 8b09649f..bd1bc15a --- a/application/worker.py +++ b/application/worker.py @@ -2,6 +2,7 @@ import os import shutil import string import zipfile +import tiktoken from urllib.parse import urljoin import requests @@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) + tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) if sample: @@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - file_data = {"name": name_job, "user": user} + file_data = {"name": name_job, "user": user, "tokens":tokens} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), @@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): max_tokens=max_tokens, token_check=token_check, ) - # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) + tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) # Proceed with uploading and cleaning as in the original function - file_data = {"name": name_job, "user": user} + file_data = {"name": name_job, "user": user, "tokens":tokens} if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), "file_pkl": open(full_path + "/index.pkl", "rb"), } + requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data ) @@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): shutil.rmtree(full_path) return {"urls": 
source_data, "name_job": name_job, "user": user, "limited": False} + + +def count_tokens_docs(docs): + # Here we convert the docs list to a string and calculate the number of tokens the string represents. + # docs_content = (" ".join(docs)) + docs_content = "" + for doc in docs: + docs_content += doc.page_content + + tokens, total_price = num_tokens_from_string( + string=docs_content, encoding_name="cl100k_base" + ) + # Only the token count is returned; the estimated price is discarded here. + return tokens + + +def num_tokens_from_string(string: str, encoding_name: str) -> tuple[int, float]: + # Function to convert string to tokens and estimate user cost. + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + total_price = (num_tokens / 1000) * 0.0004 + return num_tokens, total_price \ No newline at end of file diff --git a/frontend/src/models/misc.ts b/frontend/src/models/misc.ts index ca7694b9..52787932 100644 --- a/frontend/src/models/misc.ts +++ b/frontend/src/models/misc.ts @@ -13,6 +13,7 @@ export type Doc = { date: string; docLink: string; model: string; + tokens?: string; }; export type PromptProps = { diff --git a/frontend/src/settings/Documents.tsx b/frontend/src/settings/Documents.tsx index 18ae687c..c7e7e834 100644 --- a/frontend/src/settings/Documents.tsx +++ b/frontend/src/settings/Documents.tsx @@ -14,6 +14,7 @@ const Documents: React.FC = ({ Document Name Vector Date + Token usage Type @@ -28,6 +29,9 @@ const Documents: React.FC = ({ {document.date} + + {document.tokens ? document.tokens : ''} + {document.location === 'remote' ? 'Pre-loaded'