mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-12-01 01:23:14 +00:00
@@ -1,7 +1,7 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from flask import Blueprint, request, Response
|
||||
from flask import Blueprint, request, Response, current_app
|
||||
import json
|
||||
import datetime
|
||||
import logging
|
||||
@@ -267,6 +267,10 @@ def stream():
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
|
||||
current_app.logger.info(f"/stream - request_data: {data}, source: {source}",
|
||||
extra={"data": json.dumps({"request_data": data, "source": source})}
|
||||
)
|
||||
|
||||
prompt = get_prompt(prompt_id)
|
||||
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
@@ -301,7 +305,9 @@ def stream():
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
except Exception as e:
|
||||
print("\033[91merr", str(e), file=sys.stderr)
|
||||
current_app.logger.error(f"/stream - error: {str(e)} - traceback: {traceback.format_exc()}",
|
||||
extra={"error": str(e), "traceback": traceback.format_exc()}
|
||||
)
|
||||
message = e.args[0]
|
||||
status_code = 400
|
||||
# # Custom exceptions with two arguments, index 1 as status code
|
||||
@@ -345,7 +351,6 @@ def api_answer():
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
|
||||
# use try and except to check for exception
|
||||
try:
|
||||
# check if the vectorstore is set
|
||||
if "api_key" in data:
|
||||
@@ -365,6 +370,10 @@ def api_answer():
|
||||
|
||||
prompt = get_prompt(prompt_id)
|
||||
|
||||
current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}",
|
||||
extra={"data": json.dumps({"request_data": data, "source": source})}
|
||||
)
|
||||
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
retriever_name,
|
||||
question=question,
|
||||
@@ -399,9 +408,9 @@ def api_answer():
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
# print whole traceback
|
||||
traceback.print_exc()
|
||||
print(str(e))
|
||||
current_app.logger.error(f"/api/answer - error: {str(e)} - traceback: {traceback.format_exc()}",
|
||||
extra={"error": str(e), "traceback": traceback.format_exc()}
|
||||
)
|
||||
return bad_request(500, str(e))
|
||||
|
||||
|
||||
@@ -433,6 +442,10 @@ def api_search():
|
||||
token_limit = data["token_limit"]
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
|
||||
current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}",
|
||||
extra={"data": json.dumps({"request_data": data, "source": source})}
|
||||
)
|
||||
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
retriever_name,
|
||||
|
||||
@@ -6,12 +6,14 @@ from application.core.settings import settings
|
||||
from application.api.user.routes import user
|
||||
from application.api.answer.routes import answer
|
||||
from application.api.internal.routes import internal
|
||||
from application.core.logging_config import setup_logging
|
||||
|
||||
if platform.system() == "Windows":
|
||||
import pathlib
|
||||
pathlib.PosixPath = pathlib.WindowsPath
|
||||
|
||||
dotenv.load_dotenv()
|
||||
setup_logging()
|
||||
|
||||
app = Flask(__name__)
|
||||
app.register_blueprint(user)
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
from celery import Celery
|
||||
from application.core.settings import settings
|
||||
from celery.signals import setup_logging
|
||||
|
||||
def make_celery(app_name=__name__):
|
||||
celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND)
|
||||
celery.conf.update(settings)
|
||||
return celery
|
||||
|
||||
@setup_logging.connect
|
||||
def config_loggers(*args, **kwargs):
|
||||
from application.core.logging_config import setup_logging
|
||||
setup_logging()
|
||||
|
||||
celery = make_celery()
|
||||
|
||||
22
application/core/logging_config.py
Normal file
22
application/core/logging_config.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from logging.config import dictConfig
|
||||
|
||||
def setup_logging():
|
||||
dictConfig({
|
||||
'version': 1,
|
||||
'formatters': {
|
||||
'default': {
|
||||
'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
|
||||
}
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
"class": "logging.StreamHandler",
|
||||
"stream": "ext://sys.stdout",
|
||||
"formatter": "default",
|
||||
}
|
||||
},
|
||||
'root': {
|
||||
'level': 'INFO',
|
||||
'handlers': ['console'],
|
||||
},
|
||||
})
|
||||
@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote
|
||||
|
||||
class CrawlerLoader(BaseRemote):
|
||||
def __init__(self, limit=10):
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
self.loader = WebBaseLoader # Initialize the document loader
|
||||
self.limit = limit # Set the limit for the number of pages to scrape
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote
|
||||
|
||||
class SitemapLoader(BaseRemote):
|
||||
def __init__(self, limit=20):
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
self.loader = WebBaseLoader
|
||||
self.limit = limit # Adding limit to control the number of URLs to process
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
from application.retriever.base import BaseRetriever
|
||||
from application.core.settings import settings
|
||||
from application.llm.llm_creator import LLMCreator
|
||||
from application.utils import count_tokens
|
||||
from application.utils import num_tokens_from_string
|
||||
from langchain_community.tools import BraveSearch
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ class BraveRetSearch(BaseRetriever):
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = count_tokens(i["prompt"]) + count_tokens(
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
i["response"]
|
||||
)
|
||||
if tokens_current_history + tokens_batch < self.token_limit:
|
||||
|
||||
@@ -4,7 +4,7 @@ from application.core.settings import settings
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
from application.llm.llm_creator import LLMCreator
|
||||
|
||||
from application.utils import count_tokens
|
||||
from application.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class ClassicRAG(BaseRetriever):
|
||||
@@ -98,7 +98,7 @@ class ClassicRAG(BaseRetriever):
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = count_tokens(i["prompt"]) + count_tokens(
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
i["response"]
|
||||
)
|
||||
if tokens_current_history + tokens_batch < self.token_limit:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from application.retriever.base import BaseRetriever
|
||||
from application.core.settings import settings
|
||||
from application.llm.llm_creator import LLMCreator
|
||||
from application.utils import count_tokens
|
||||
from application.utils import num_tokens_from_string
|
||||
from langchain_community.tools import DuckDuckGoSearchResults
|
||||
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
|
||||
|
||||
@@ -95,7 +95,7 @@ class DuckDuckSearch(BaseRetriever):
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = count_tokens(i["prompt"]) + count_tokens(
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
i["response"]
|
||||
)
|
||||
if tokens_current_history + tokens_batch < self.token_limit:
|
||||
|
||||
@@ -2,7 +2,7 @@ import sys
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
from application.core.settings import settings
|
||||
from application.utils import count_tokens
|
||||
from application.utils import num_tokens_from_string
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
@@ -24,9 +24,9 @@ def update_token_usage(user_api_key, token_usage):
|
||||
def gen_token_usage(func):
|
||||
def wrapper(self, model, messages, stream, **kwargs):
|
||||
for message in messages:
|
||||
self.token_usage["prompt_tokens"] += count_tokens(message["content"])
|
||||
self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
|
||||
result = func(self, model, messages, stream, **kwargs)
|
||||
self.token_usage["generated_tokens"] += count_tokens(result)
|
||||
self.token_usage["generated_tokens"] += num_tokens_from_string(result)
|
||||
update_token_usage(self.user_api_key, self.token_usage)
|
||||
return result
|
||||
|
||||
@@ -36,14 +36,14 @@ def gen_token_usage(func):
|
||||
def stream_token_usage(func):
|
||||
def wrapper(self, model, messages, stream, **kwargs):
|
||||
for message in messages:
|
||||
self.token_usage["prompt_tokens"] += count_tokens(message["content"])
|
||||
self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
|
||||
batch = []
|
||||
result = func(self, model, messages, stream, **kwargs)
|
||||
for r in result:
|
||||
batch.append(r)
|
||||
yield r
|
||||
for line in batch:
|
||||
self.token_usage["generated_tokens"] += count_tokens(line)
|
||||
self.token_usage["generated_tokens"] += num_tokens_from_string(line)
|
||||
update_token_usage(self.user_api_key, self.token_usage)
|
||||
|
||||
return wrapper
|
||||
|
||||
@@ -1,6 +1,22 @@
|
||||
from transformers import GPT2TokenizerFast
|
||||
import tiktoken
|
||||
|
||||
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
|
||||
tokenizer.model_max_length = 100000
|
||||
def count_tokens(string):
|
||||
return len(tokenizer(string)['input_ids'])
|
||||
_encoding = None
|
||||
|
||||
def get_encoding():
|
||||
global _encoding
|
||||
if _encoding is None:
|
||||
_encoding = tiktoken.get_encoding("cl100k_base")
|
||||
return _encoding
|
||||
|
||||
def num_tokens_from_string(string: str) -> int:
|
||||
encoding = get_encoding()
|
||||
num_tokens = len(encoding.encode(string))
|
||||
return num_tokens
|
||||
|
||||
def count_tokens_docs(docs):
|
||||
docs_content = ""
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
tokens = num_tokens_from_string(docs_content)
|
||||
return tokens
|
||||
@@ -2,8 +2,8 @@ import os
|
||||
import shutil
|
||||
import string
|
||||
import zipfile
|
||||
import tiktoken
|
||||
from urllib.parse import urljoin
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
@@ -13,6 +13,8 @@ from application.parser.remote.remote_creator import RemoteCreator
|
||||
from application.parser.open_ai_func import call_openai_api
|
||||
from application.parser.schema.base import Document
|
||||
from application.parser.token_func import group_split
|
||||
from application.utils import count_tokens_docs
|
||||
|
||||
|
||||
# Define a function to extract metadata from a given filename.
|
||||
def metadata_from_filename(title):
|
||||
@@ -41,7 +43,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
|
||||
max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
|
||||
"""
|
||||
if current_depth > max_depth:
|
||||
print(f"Reached maximum recursion depth of {max_depth}")
|
||||
logging.warning(f"Reached maximum recursion depth of {max_depth}")
|
||||
return
|
||||
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
@@ -88,16 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
max_tokens = 1250
|
||||
recursion_depth = 2
|
||||
full_path = os.path.join(directory, user, name_job)
|
||||
import sys
|
||||
|
||||
print(full_path, file=sys.stderr)
|
||||
logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
|
||||
# check if API_URL env variable is set
|
||||
file_data = {"name": name_job, "file": filename, "user": user}
|
||||
response = requests.get(
|
||||
urljoin(settings.API_URL, "/api/download"), params=file_data
|
||||
)
|
||||
# check if file is in the response
|
||||
print(response, file=sys.stderr)
|
||||
file = response.content
|
||||
|
||||
if not os.path.exists(full_path):
|
||||
@@ -137,7 +136,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
if sample:
|
||||
for i in range(min(5, len(raw_docs))):
|
||||
print(raw_docs[i].text)
|
||||
logging.info(f"Sample document {i}: {raw_docs[i]}")
|
||||
|
||||
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
|
||||
# and send them to the server (provide user and name in form)
|
||||
@@ -180,6 +179,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
self.update_state(state="PROGRESS", meta={"current": 1})
|
||||
logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, source_data: source_data})
|
||||
|
||||
remote_loader = RemoteCreator.create_loader(loader)
|
||||
raw_docs = remote_loader.load_data(source_data)
|
||||
@@ -212,26 +212,4 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
|
||||
shutil.rmtree(full_path)
|
||||
|
||||
return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
|
||||
|
||||
|
||||
def count_tokens_docs(docs):
|
||||
# Here we convert the docs list to a string and calculate the number of tokens the string represents.
|
||||
# docs_content = (" ".join(docs))
|
||||
docs_content = ""
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
tokens, total_price = num_tokens_from_string(
|
||||
string=docs_content, encoding_name="cl100k_base"
|
||||
)
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
return tokens
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
||||
# Function to convert string to tokens and estimate user cost.
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
num_tokens = len(encoding.encode(string))
|
||||
total_price = (num_tokens / 1000) * 0.0004
|
||||
return num_tokens, total_price
|
||||
return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
|
||||
Reference in New Issue
Block a user