Merge branch 'main' into Jackson

2026-02-21 20:01:26 +00:00 · 2024-09-05 23:43:17 +01:00
parent d232229abf 490e58fb52
commit 2ff8c0b128
66 changed files with 3760 additions and 5225 deletions
--- a/application/Dockerfile
+++ b/application/Dockerfile
@@ -4,14 +4,11 @@ FROM ubuntu:24.04 as builder
 ENV DEBIAN_FRONTEND=noninteractive

 RUN apt-get update && \
-    apt-get install -y software-properties-common
-
-RUN add-apt-repository ppa:deadsnakes/ppa
-
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
 # Install necessary packages and Python
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends gcc curl wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \
-    apt-get clean && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \
    rm -rf /var/lib/apt/lists/* 

 # Verify Python installation and setup symlink
@@ -27,7 +24,7 @@ RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.z
    rm mpnet-base-v2.zip

 # Install Rust
-RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+RUN wget -q -O - https://sh.rustup.rs | sh -s -- -y

 # Clean up to reduce container size
 RUN apt-get remove --purge -y wget unzip && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
@@ -50,12 +47,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
 FROM ubuntu:24.04 as final

 RUN apt-get update && \
-    apt-get install -y software-properties-common
-
-RUN add-apt-repository ppa:deadsnakes/ppa
-
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
 # Install Python
-RUN apt-get update && apt-get install -y --no-install-recommends python3.11 && \
+    apt-get update && apt-get install -y --no-install-recommends python3.11 && \
    ln -s /usr/bin/python3.11 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

--- a/application/api/answer/routes.py
+++ b/application/api/answer/routes.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import sys
-from flask import Blueprint, request, Response
+from flask import Blueprint, request, Response, current_app
 import json
 import datetime
 import logging
@@ -74,7 +74,7 @@ def run_async_chain(chain, question, chat_history):

 def get_data_from_api_key(api_key):
    data = api_key_collection.find_one({"key": api_key})
-    
+
    # # Raise custom exception if the API key is not found
    if data is None:
        raise Exception("Invalid API Key, please generate new key", 401)
@@ -129,10 +129,10 @@ def save_conversation(conversation_id, question, response, source_log_docs, llm)
                "content": "Summarise following conversation in no more than 3 "
                "words, respond ONLY with the summary, use the same "
                "language as the system \n\nUser: "
-                +question
-                +"\n\n"
-                +"AI: "
-                +response,
+                + question
+                + "\n\n"
+                + "AI: "
+                + response,
            },
            {
                "role": "user",
@@ -172,7 +172,9 @@ def get_prompt(prompt_id):
    return prompt


-def complete_stream(question, retriever, conversation_id, user_api_key):
+def complete_stream(
+    question, retriever, conversation_id, user_api_key, isNoneDoc=False
+):

    try:
        response_full = ""
@@ -186,126 +188,142 @@ def complete_stream(question, retriever, conversation_id, user_api_key):
            elif "source" in line:
                source_log_docs.append(line["source"])

+        if isNoneDoc:
+            for doc in source_log_docs:
+                doc["source"] = "None"
+
        llm = LLMCreator.create_llm(
            settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=user_api_key
-            )
-        if(user_api_key is None):
+        )
+        if user_api_key is None:
            conversation_id = save_conversation(
                conversation_id, question, response_full, source_log_docs, llm
            )
            # send data.type = "end" to indicate that the stream has ended as json
            data = json.dumps({"type": "id", "id": str(conversation_id)})
            yield f"data: {data}\n\n"
-            
+
        data = json.dumps({"type": "end"})
        yield f"data: {data}\n\n"
    except Exception as e:
        print("\033[91merr", str(e), file=sys.stderr)
-        data = json.dumps({"type": "error","error":"Please try again later. We apologize for any inconvenience.",
-          "error_exception": str(e)})
+        data = json.dumps(
+            {
+                "type": "error",
+                "error": "Please try again later. We apologize for any inconvenience.",
+                "error_exception": str(e),
+            }
+        )
        yield f"data: {data}\n\n"
-        return 
+        return
+

@answer.route("/stream", methods=["POST"])
 def stream():
-   try:
-    data = request.get_json()
-    # get parameter from url question
-    question = data["question"]
-    if "history" not in data:
-        history = []
-    else:
-        history = data["history"]
-        history = json.loads(history)
-    if "conversation_id" not in data:
-        conversation_id = None
-    else:
-        conversation_id = data["conversation_id"]
-    if "prompt_id" in data:
-        prompt_id = data["prompt_id"]
-    else:
-        prompt_id = "default"
-    if "selectedDocs" in data and data["selectedDocs"] is None:
-        chunks = 0
-    elif "chunks" in data:
-        chunks = int(data["chunks"])
-    else:
-        chunks = 2
-    if "token_limit" in data:
-        token_limit = data["token_limit"]
-    else:
-        token_limit = settings.DEFAULT_MAX_HISTORY
+    try:
+        data = request.get_json()
+        question = data["question"]
+        if "history" not in data:
+            history = []
+        else:
+            history = data["history"]
+            history = json.loads(history)
+        if "conversation_id" not in data:
+            conversation_id = None
+        else:
+            conversation_id = data["conversation_id"]
+        if "prompt_id" in data:
+            prompt_id = data["prompt_id"]
+        else:
+            prompt_id = "default"
+        if "selectedDocs" in data and data["selectedDocs"] is None:
+            chunks = 0
+        elif "chunks" in data:
+            chunks = int(data["chunks"])
+        else:
+            chunks = 2
+        if "token_limit" in data:
+            token_limit = data["token_limit"]
+        else:
+            token_limit = settings.DEFAULT_MAX_HISTORY

-    # check if active_docs or api_key is set
+        # check if active_docs or api_key is set

-    if "api_key" in data:
-        data_key = get_data_from_api_key(data["api_key"])
-        chunks = int(data_key["chunks"])
-        prompt_id = data_key["prompt_id"]
-        source = {"active_docs": data_key["source"]}
-        user_api_key = data["api_key"]
-    elif "active_docs" in data:
-        source = {"active_docs": data["active_docs"]}
-        user_api_key = None
-    else:
-        source = {}
-        user_api_key = None
+        if "api_key" in data:
+            data_key = get_data_from_api_key(data["api_key"])
+            chunks = int(data_key["chunks"])
+            prompt_id = data_key["prompt_id"]
+            source = {"active_docs": data_key["source"]}
+            user_api_key = data["api_key"]
+        elif "active_docs" in data:
+            source = {"active_docs": data["active_docs"]}
+            user_api_key = None
+        else:
+            source = {}
+            user_api_key = None

-    if (
-        source["active_docs"].split("/")[0] == "default"
-        or source["active_docs"].split("/")[0] == "local"
-    ):
-        retriever_name = "classic"
-    else:
-        retriever_name = source["active_docs"]
+        if source["active_docs"].split("/")[0] in ["default", "local"]:
+            retriever_name = "classic"
+        else:
+            retriever_name = source["active_docs"]

-    prompt = get_prompt(prompt_id)
+        current_app.logger.info(f"/stream - request_data: {data}, source: {source}",
+            extra={"data": json.dumps({"request_data": data, "source": source})}
+        )

-    retriever = RetrieverCreator.create_retriever(
-        retriever_name,
-        question=question,
-        source=source,
-        chat_history=history,
-        prompt=prompt,
-        chunks=chunks,
-        token_limit=token_limit,
-        gpt_model=gpt_model,
-        user_api_key=user_api_key,
-    )
+        prompt = get_prompt(prompt_id)

-    return Response(
-        complete_stream(
+        retriever = RetrieverCreator.create_retriever(
+            retriever_name,
            question=question,
-            retriever=retriever,
-            conversation_id=conversation_id,
+            source=source,
+            chat_history=history,
+            prompt=prompt,
+            chunks=chunks,
+            token_limit=token_limit,
+            gpt_model=gpt_model,
            user_api_key=user_api_key,
-        ),
-        mimetype="text/event-stream",
-    )
-    
-   except ValueError:
-       message = "Malformed request body"
-       print("\033[91merr", str(message), file=sys.stderr)
-       return Response(
-        error_stream_generate(message),
-        status=400,
-        mimetype="text/event-stream",
-    )
-   except Exception as e:
-        print("\033[91merr", str(e), file=sys.stderr)
+        )
+
+        return Response(
+            complete_stream(
+                question=question,
+                retriever=retriever,
+                conversation_id=conversation_id,
+                user_api_key=user_api_key,
+                isNoneDoc=data.get("isNoneDoc"),
+            ),
+            mimetype="text/event-stream",
+        )
+
+    except ValueError:
+        message = "Malformed request body"
+        print("\033[91merr", str(message), file=sys.stderr)
+        return Response(
+            error_stream_generate(message),
+            status=400,
+            mimetype="text/event-stream",
+        )
+    except Exception as e:
+        current_app.logger.error(f"/stream - error: {str(e)} - traceback: {traceback.format_exc()}",
+          extra={"error": str(e), "traceback": traceback.format_exc()}
+        )
        message = e.args[0]
        status_code = 400
        # # Custom exceptions with two arguments, index 1 as status code
-        if(len(e.args) >= 2):
+        if len(e.args) >= 2:
            status_code = e.args[1]
        return Response(
-        error_stream_generate(message),
-        status=status_code,
-        mimetype="text/event-stream",
-    )
+            error_stream_generate(message),
+            status=status_code,
+            mimetype="text/event-stream",
+        )
+
+
 def error_stream_generate(err_response):
-            data = json.dumps({"type": "error", "error":err_response})
-            yield f"data: {data}\n\n"
+    data = json.dumps({"type": "error", "error": err_response})
+    yield f"data: {data}\n\n"
+

@answer.route("/api/answer", methods=["POST"])
 def api_answer():
@@ -333,7 +351,6 @@ def api_answer():
    else:
        token_limit = settings.DEFAULT_MAX_HISTORY

-    # use try and except  to check for exception
    try:
        # check if the vectorstore is set
        if "api_key" in data:
@@ -346,16 +363,17 @@ def api_answer():
            source = data
            user_api_key = None

-        if (
-            source["active_docs"].split("/")[0] == "default"
-            or source["active_docs"].split("/")[0] == "local"
-        ):
+        if source["active_docs"].split("/")[0] in ["default", "local"]:
            retriever_name = "classic"
        else:
            retriever_name = source["active_docs"]

        prompt = get_prompt(prompt_id)

+        current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}",
+            extra={"data": json.dumps({"request_data": data, "source": source})}
+        )
+
        retriever = RetrieverCreator.create_retriever(
            retriever_name,
            question=question,
@@ -375,6 +393,10 @@ def api_answer():
            elif "answer" in line:
                response_full += line["answer"]

+        if data.get("isNoneDoc"):
+            for doc in source_log_docs:
+                doc["source"] = "None"
+
        llm = LLMCreator.create_llm(
            settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=user_api_key
        )
@@ -386,16 +408,15 @@ def api_answer():

        return result
    except Exception as e:
-        # print whole traceback
-        traceback.print_exc()
-        print(str(e))
+        current_app.logger.error(f"/api/answer - error: {str(e)} - traceback: {traceback.format_exc()}",
+          extra={"error": str(e), "traceback": traceback.format_exc()}
+        )
        return bad_request(500, str(e))


@answer.route("/api/search", methods=["POST"])
 def api_search():
    data = request.get_json()
-    # get parameter from url question
    question = data["question"]
    if "chunks" in data:
        chunks = int(data["chunks"])
@@ -413,10 +434,7 @@ def api_search():
        source = {}
        user_api_key = None

-    if (
-        source["active_docs"].split("/")[0] == "default"
-        or source["active_docs"].split("/")[0] == "local"
-    ):
+    if source["active_docs"].split("/")[0] in ["default", "local"]:
        retriever_name = "classic"
    else:
        retriever_name = source["active_docs"]
@@ -424,6 +442,10 @@ def api_search():
        token_limit = data["token_limit"]
    else:
        token_limit = settings.DEFAULT_MAX_HISTORY
+        
+    current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}",
+            extra={"data": json.dumps({"request_data": data, "source": source})}
+    )

    retriever = RetrieverCreator.create_retriever(
        retriever_name,
@@ -437,4 +459,9 @@ def api_search():
        user_api_key=user_api_key,
    )
    docs = retriever.search()
+
+    if data.get("isNoneDoc"):
+        for doc in docs:
+            doc["source"] = "None"
+
    return docs
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -44,7 +44,7 @@ def delete_conversation():
    return {"status": "ok"}


-@user.route("/api/delete_all_conversations", methods=["POST"])
+@user.route("/api/delete_all_conversations", methods=["GET"])
 def delete_all_conversations():
    user_id = "local"
    conversations_collection.delete_many({"user": user_id})
@@ -256,7 +256,7 @@ def combined_json():
            "docLink": "default",
            "model": settings.EMBEDDINGS_NAME,
            "location": "remote",
-            "tokens":""
+            "tokens": "",
        }
    ]
    # structure: name, language, version, description, fullName, date, docLink
@@ -273,7 +273,7 @@ def combined_json():
                "docLink": index["location"],
                "model": settings.EMBEDDINGS_NAME,
                "location": "local",
-                "tokens" : index["tokens"] if ("tokens" in index.keys()) else ""
+                "tokens": index["tokens"] if ("tokens" in index.keys()) else "",
            }
        )
    if settings.VECTOR_STORE == "faiss":
@@ -295,7 +295,7 @@ def combined_json():
                "docLink": "duckduck_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
-                "tokens":""
+                "tokens": "",
            }
        )
    if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -310,7 +310,7 @@ def combined_json():
                "docLink": "brave_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
-                "tokens":""
+                "tokens": "",
            }
        )

@@ -496,138 +496,204 @@ def delete_api_key():
    return {"status": "ok"}


-#route to share conversation
+# route to share conversation
 ##isPromptable should be passed through queries
-@user.route("/api/share",methods=["POST"])
+@user.route("/api/share", methods=["POST"])
 def share_conversation():
    try:
        data = request.get_json()
        user = "local" if "user" not in data else data["user"]
        conversation_id = data["conversation_id"]
        isPromptable = request.args.get("isPromptable").lower() == "true"
-        
-        conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)})
+
+        conversation = conversations_collection.find_one(
+            {"_id": ObjectId(conversation_id)}
+        )
        current_n_queries = len(conversation["queries"])
-        
-         ##generate binary representation of uuid
+
+        ## generate the binary representation of a new random UUID
        explicit_binary = Binary.from_uuid(uuid.uuid4(), UuidRepresentation.STANDARD)
-        
-        if(isPromptable):
+
+        if isPromptable:
            source = "default" if "source" not in data else data["source"]
            prompt_id = "default" if "prompt_id" not in data else data["prompt_id"]
            chunks = "2" if "chunks" not in data else data["chunks"]
-            
-            name = conversation["name"]+"(shared)"
-            pre_existing_api_document = api_key_collection.find_one({
-                    "prompt_id":prompt_id,
-                    "chunks":chunks,
-                    "source":source,
-                    "user":user    
-                })
+
+            name = conversation["name"] + "(shared)"
+            pre_existing_api_document = api_key_collection.find_one(
+                {
+                    "prompt_id": prompt_id,
+                    "chunks": chunks,
+                    "source": source,
+                    "user": user,
+                }
+            )
            api_uuid = str(uuid.uuid4())
-            if(pre_existing_api_document):
-                 api_uuid = pre_existing_api_document["key"]
-                 pre_existing = shared_conversations_collections.find_one({
-                    "conversation_id":DBRef("conversations",ObjectId(conversation_id)),
-                    "isPromptable":isPromptable,
-                    "first_n_queries":current_n_queries,
-                    "user":user,
-                    "api_key":api_uuid
-                })
-                 if(pre_existing is not None):
-                     return jsonify({"success":True, "identifier":str(pre_existing["uuid"].as_uuid())}),200
-                 else:
-                     shared_conversations_collections.insert_one({
-                      "uuid":explicit_binary,
-                      "conversation_id": {
-                       "$ref":"conversations",
-                       "$id":ObjectId(conversation_id)
-                      } ,
-                     "isPromptable":isPromptable,
-                     "first_n_queries":current_n_queries,
-                     "user":user,
-                    "api_key":api_uuid
-                 })
-                     return jsonify({"success":True,"identifier":str(explicit_binary.as_uuid())})
+            if pre_existing_api_document:
+                api_uuid = pre_existing_api_document["key"]
+                pre_existing = shared_conversations_collections.find_one(
+                    {
+                        "conversation_id": DBRef(
+                            "conversations", ObjectId(conversation_id)
+                        ),
+                        "isPromptable": isPromptable,
+                        "first_n_queries": current_n_queries,
+                        "user": user,
+                        "api_key": api_uuid,
+                    }
+                )
+                if pre_existing is not None:
+                    return (
+                        jsonify(
+                            {
+                                "success": True,
+                                "identifier": str(pre_existing["uuid"].as_uuid()),
+                            }
+                        ),
+                        200,
+                    )
+                else:
+                    shared_conversations_collections.insert_one(
+                        {
+                            "uuid": explicit_binary,
+                            "conversation_id": {
+                                "$ref": "conversations",
+                                "$id": ObjectId(conversation_id),
+                            },
+                            "isPromptable": isPromptable,
+                            "first_n_queries": current_n_queries,
+                            "user": user,
+                            "api_key": api_uuid,
+                        }
+                    )
+                    return jsonify(
+                        {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                    )
            else:
                api_key_collection.insert_one(
-                   {
-                   "name": name,
-                   "key": api_uuid,
-                   "source": source,
-                   "user": user,
-                   "prompt_id": prompt_id,
-                   "chunks": chunks,
-                 } 
-               )       
-            shared_conversations_collections.insert_one({
-                "uuid":explicit_binary,
-                "conversation_id": {
-                  "$ref":"conversations",
-                  "$id":ObjectId(conversation_id)
-               } ,
-               "isPromptable":isPromptable,
-               "first_n_queries":current_n_queries,
-                "user":user,
-               "api_key":api_uuid
-           })
+                    {
+                        "name": name,
+                        "key": api_uuid,
+                        "source": source,
+                        "user": user,
+                        "prompt_id": prompt_id,
+                        "chunks": chunks,
+                    }
+                )
+            shared_conversations_collections.insert_one(
+                {
+                    "uuid": explicit_binary,
+                    "conversation_id": {
+                        "$ref": "conversations",
+                        "$id": ObjectId(conversation_id),
+                    },
+                    "isPromptable": isPromptable,
+                    "first_n_queries": current_n_queries,
+                    "user": user,
+                    "api_key": api_uuid,
+                }
+            )
            ## Identifier as route parameter in frontend
-            return jsonify({"success":True, "identifier":str(explicit_binary.as_uuid())}),201
-        
-        ##isPromptable = False
-        pre_existing = shared_conversations_collections.find_one({
-            "conversation_id":DBRef("conversations",ObjectId(conversation_id)),
-            "isPromptable":isPromptable,
-            "first_n_queries":current_n_queries,
-            "user":user
-        })
-        if(pre_existing is not None):
-            return jsonify({"success":True, "identifier":str(pre_existing["uuid"].as_uuid())}),200
-        else:     
-           shared_conversations_collections.insert_one({
-           "uuid":explicit_binary,
-           "conversation_id": {
-                  "$ref":"conversations",
-                  "$id":ObjectId(conversation_id)
-                  } ,
-                    "isPromptable":isPromptable,
-                    "first_n_queries":current_n_queries,
-                    "user":user
-           })
-            ## Identifier as route parameter in frontend
-           return jsonify({"success":True, "identifier":str(explicit_binary.as_uuid())}),201
-    except Exception  as err:
-        print (err)
-        return jsonify({"success":False,"error":str(err)}),400
+            return (
+                jsonify(
+                    {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                ),
+                201,
+            )

-#route to get publicly shared conversations
-@user.route("/api/shared_conversation/<string:identifier>",methods=["GET"])
-def get_publicly_shared_conversations(identifier : str):
-    try:
-        query_uuid = Binary.from_uuid(uuid.UUID(identifier), UuidRepresentation.STANDARD)
-        shared = shared_conversations_collections.find_one({"uuid":query_uuid})
-        conversation_queries=[]
-        if shared and 'conversation_id' in shared and isinstance(shared['conversation_id'], DBRef):
-        # Resolve the DBRef
-            conversation_ref = shared['conversation_id']
-            conversation = db.dereference(conversation_ref)
-            if(conversation is None):
-                return jsonify({"sucess":False,"error":"might have broken url or the conversation does not exist"}),404
-            conversation_queries = conversation['queries'][:(shared["first_n_queries"])]
-            for query in conversation_queries:
-                query.pop("sources") ## avoid exposing sources
+        ## isPromptable is False: share the conversation without an api_key
+        pre_existing = shared_conversations_collections.find_one(
+            {
+                "conversation_id": DBRef("conversations", ObjectId(conversation_id)),
+                "isPromptable": isPromptable,
+                "first_n_queries": current_n_queries,
+                "user": user,
+            }
+        )
+        if pre_existing is not None:
+            return (
+                jsonify(
+                    {"success": True, "identifier": str(pre_existing["uuid"].as_uuid())}
+                ),
+                200,
+            )
        else:
-            return jsonify({"sucess":False,"error":"might have broken url or the conversation does not exist"}),404
+            shared_conversations_collections.insert_one(
+                {
+                    "uuid": explicit_binary,
+                    "conversation_id": {
+                        "$ref": "conversations",
+                        "$id": ObjectId(conversation_id),
+                    },
+                    "isPromptable": isPromptable,
+                    "first_n_queries": current_n_queries,
+                    "user": user,
+                }
+            )
+            ## Identifier as route parameter in frontend
+            return (
+                jsonify(
+                    {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                ),
+                201,
+            )
+    except Exception as err:
+        print(err)
+        return jsonify({"success": False, "error": str(err)}), 400
+
+
+# route to get publicly shared conversations
+@user.route("/api/shared_conversation/<string:identifier>", methods=["GET"])
+def get_publicly_shared_conversations(identifier: str):
+    try:
+        query_uuid = Binary.from_uuid(
+            uuid.UUID(identifier), UuidRepresentation.STANDARD
+        )
+        shared = shared_conversations_collections.find_one({"uuid": query_uuid})
+        conversation_queries = []
+        if (
+            shared
+            and "conversation_id" in shared
+            and isinstance(shared["conversation_id"], DBRef)
+        ):
+            # Resolve the DBRef
+            conversation_ref = shared["conversation_id"]
+            conversation = db.dereference(conversation_ref)
+            if conversation is None:
+                return (
+                    jsonify(
+                        {
+                            "sucess": False,
+                            "error": "might have broken url or the conversation does not exist",
+                        }
+                    ),
+                    404,
+                )
+            conversation_queries = conversation["queries"][
+                : (shared["first_n_queries"])
+            ]
+            for query in conversation_queries:
+                query.pop("sources")  ## avoid exposing sources
+        else:
+            return (
+                jsonify(
+                    {
+                        "sucess": False,
+                        "error": "might have broken url or the conversation does not exist",
+                    }
+                ),
+                404,
+            )
        date = conversation["_id"].generation_time.isoformat()
        res = {
-            "success":True,
-            "queries":conversation_queries,
-            "title":conversation["name"],
-            "timestamp":date
-            }
-        if(shared["isPromptable"] and "api_key" in shared):
+            "success": True,
+            "queries": conversation_queries,
+            "title": conversation["name"],
+            "timestamp": date,
+        }
+        if shared["isPromptable"] and "api_key" in shared:
            res["api_key"] = shared["api_key"]
        return jsonify(res), 200
    except Exception as err:
-        print (err)
-        return jsonify({"success":False,"error":str(err)}),400
+        print(err)
+        return jsonify({"success": False, "error": str(err)}), 400
--- a/application/app.py
+++ b/application/app.py
@@ -6,12 +6,14 @@ from application.core.settings import settings
 from application.api.user.routes import user
 from application.api.answer.routes import answer
 from application.api.internal.routes import internal
+from application.core.logging_config import setup_logging

 if platform.system() == "Windows":
    import pathlib
    pathlib.PosixPath = pathlib.WindowsPath

 dotenv.load_dotenv()
+setup_logging()

 app = Flask(__name__)
 app.register_blueprint(user)
--- a/application/celery_init.py
+++ b/application/celery_init.py
@@ -1,9 +1,15 @@
 from celery import Celery
 from application.core.settings import settings
+from celery.signals import setup_logging

 def make_celery(app_name=__name__):
    celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND)
    celery.conf.update(settings)
    return celery

+@setup_logging.connect
+def config_loggers(*args, **kwargs):
+    from application.core.logging_config import setup_logging
+    setup_logging()
+
 celery = make_celery()
--- a/application/core/logging_config.py
+++ b/application/core/logging_config.py
@@ -0,0 +1,22 @@
+from logging.config import dictConfig
+
+def setup_logging():
+    dictConfig({
+        'version': 1,
+        'formatters': {
+            'default': {
+                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
+            }
+        },
+        "handlers": {
+            "console": {
+                "class": "logging.StreamHandler",
+                "stream": "ext://sys.stdout",
+                "formatter": "default",
+            }
+        },
+        'root': {
+            'level': 'INFO',
+            'handlers': ['console'],
+        },
+    })
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -29,6 +29,7 @@ class Settings(BaseSettings):
    OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
    AZURE_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for answering
    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for embeddings
+    OPENAI_BASE_URL: Optional[str] = None  # OpenAI base URL, for OpenAI-compatible models

    # elasticsearch
    ELASTIC_CLOUD_ID: Optional[str] = None  # cloud id for elasticsearch
--- a/application/llm/openai.py
+++ b/application/llm/openai.py
@@ -2,25 +2,23 @@ from application.llm.base import BaseLLM
 from application.core.settings import settings


+
 class OpenAILLM(BaseLLM):

    def __init__(self, api_key=None, user_api_key=None, *args, **kwargs):
-        global openai
        from openai import OpenAI

        super().__init__(*args, **kwargs)
-        self.client = OpenAI(
-            api_key=api_key,
-        )
+        if settings.OPENAI_BASE_URL:
+            self.client = OpenAI(
+                api_key=api_key,
+                base_url=settings.OPENAI_BASE_URL
+            )
+        else:
+            self.client = OpenAI(api_key=api_key)
        self.api_key = api_key
        self.user_api_key = user_api_key

-    def _get_openai(self):
-        # Import openai when needed
-        import openai
-
-        return openai
-
    def _raw_gen(
        self,
        baseself,
@@ -29,7 +27,7 @@ class OpenAILLM(BaseLLM):
        stream=False,
        engine=settings.AZURE_DEPLOYMENT_NAME,
        **kwargs
-    ):
+    ):  
        response = self.client.chat.completions.create(
            model=model, messages=messages, stream=stream, **kwargs
        )
@@ -44,7 +42,7 @@ class OpenAILLM(BaseLLM):
        stream=True,
        engine=settings.AZURE_DEPLOYMENT_NAME,
        **kwargs
-    ):
+    ):  
        response = self.client.chat.completions.create(
            model=model, messages=messages, stream=stream, **kwargs
        )
@@ -73,8 +71,3 @@ class AzureOpenAILLM(OpenAILLM):
            api_base=settings.OPENAI_API_BASE,
            deployment_name=settings.AZURE_DEPLOYMENT_NAME,
        )
-
-    def _get_openai(self):
-        openai = super()._get_openai()
-
-        return openai
--- a/application/parser/file/html_parser.py
+++ b/application/parser/file/html_parser.py
@@ -3,7 +3,6 @@
 Contains parser for html files.

 """
-import re
 from pathlib import Path
 from typing import Dict, Union

@@ -18,66 +17,8 @@ class HTMLParser(BaseParser):
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
-        """Parse file.
+        from langchain_community.document_loaders import BSHTMLLoader

-            Returns:
-            Union[str, List[str]]: a string or a List of strings.
-        """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #   {"text": "My Title", "type": "Title"},
-        #   {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-            # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings 
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be an user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string       
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
+        loader = BSHTMLLoader(file)
+        data = loader.load()
+        return data  # NOTE(review): presumably langchain Document objects, not the annotated str/list[str] - confirm callers
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote

 class CrawlerLoader(BaseRemote):
    def __init__(self, limit=10):
-        from langchain.document_loaders import WebBaseLoader
+        from langchain_community.document_loaders import WebBaseLoader
        self.loader = WebBaseLoader  # Initialize the document loader
        self.limit = limit  # Set the limit for the number of pages to scrape

--- a/application/parser/remote/sitemap_loader.py
+++ b/application/parser/remote/sitemap_loader.py
@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote

 class SitemapLoader(BaseRemote):
    def __init__(self, limit=20):
-        from langchain.document_loaders import WebBaseLoader
+        from langchain_community.document_loaders import WebBaseLoader
        self.loader = WebBaseLoader
        self.limit = limit  # Adding limit to control the number of URLs to process

--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -1,16 +1,17 @@
-anthropic==0.12.0
-boto3==1.34.6
+anthropic==0.34.0
+boto3==1.34.153
+beautifulsoup4==4.12.3
 celery==5.3.6
-dataclasses_json==0.6.3
+dataclasses_json==0.6.7
 docx2txt==0.8
-duckduckgo-search==5.3.0
+duckduckgo-search==6.2.6
 EbookLib==0.18
-elasticsearch==8.12.0
+elasticsearch==8.14.0
 escodegen==1.0.11
 esprima==4.0.1
-faiss-cpu==1.8.0.post1
 Flask==3.0.1
-gunicorn==22.0.0
+faiss-cpu==1.8.0.post1
+gunicorn==23.0.0
 html2text==2020.1.16
 javalang==0.13.0
 langchain==0.2.16
@@ -18,12 +19,12 @@ langchain-community==0.2.16
 langchain-core==0.2.38
 langchain-openai==0.1.23
 openapi3_parser==1.1.16
-pandas==2.2.0
-pydantic_settings==2.1.0
-pymongo==4.6.3
+pandas==2.2.2
+pydantic_settings==2.4.0
+pymongo==4.8.0
 PyPDF2==3.0.1
 python-dotenv==1.0.1
-qdrant-client==1.9.0
+qdrant-client==1.11.0
 redis==5.0.1
 Requests==2.32.0
 retry==0.9.2
@@ -31,6 +32,5 @@ sentence-transformers
 tiktoken==0.7.0
 torch
 tqdm==4.66.3
-transformers==4.36.2
-unstructured==0.12.2
+transformers==4.44.0
 Werkzeug==3.0.3
--- a/application/retriever/brave_search.py
+++ b/application/retriever/brave_search.py
@@ -2,7 +2,7 @@ import json
 from application.retriever.base import BaseRetriever
 from application.core.settings import settings
 from application.llm.llm_creator import LLMCreator
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string
 from langchain_community.tools import BraveSearch


@@ -78,7 +78,7 @@ class BraveRetSearch(BaseRetriever):
            self.chat_history.reverse()
            for i in self.chat_history:
                if "prompt" in i and "response" in i:
-                    tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                    tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                        i["response"]
                    )
                    if tokens_current_history + tokens_batch < self.token_limit:
--- a/application/retriever/classic_rag.py
+++ b/application/retriever/classic_rag.py
@@ -4,7 +4,7 @@ from application.core.settings import settings
 from application.vectorstore.vector_creator import VectorCreator
 from application.llm.llm_creator import LLMCreator

-from application.utils import count_tokens
+from application.utils import num_tokens_from_string


 class ClassicRAG(BaseRetriever):
@@ -98,7 +98,7 @@ class ClassicRAG(BaseRetriever):
            self.chat_history.reverse()
            for i in self.chat_history:
                if "prompt" in i and "response" in i:
-                    tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                    tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                        i["response"]
                    )
                    if tokens_current_history + tokens_batch < self.token_limit:
--- a/application/retriever/duckduck_search.py
+++ b/application/retriever/duckduck_search.py
@@ -1,7 +1,7 @@
 from application.retriever.base import BaseRetriever
 from application.core.settings import settings
 from application.llm.llm_creator import LLMCreator
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string
 from langchain_community.tools import DuckDuckGoSearchResults
 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

@@ -95,7 +95,7 @@ class DuckDuckSearch(BaseRetriever):
            self.chat_history.reverse()
            for i in self.chat_history:
                if "prompt" in i and "response" in i:
-                    tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                    tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                        i["response"]
                    )
                    if tokens_current_history + tokens_batch < self.token_limit:
--- a/application/usage.py
+++ b/application/usage.py
@@ -2,7 +2,7 @@ import sys
 from pymongo import MongoClient
 from datetime import datetime
 from application.core.settings import settings
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string

 mongo = MongoClient(settings.MONGO_URI)
 db = mongo["docsgpt"]
@@ -24,9 +24,9 @@ def update_token_usage(user_api_key, token_usage):
 def gen_token_usage(func):
    def wrapper(self, model, messages, stream, **kwargs):
        for message in messages:
-            self.token_usage["prompt_tokens"] += count_tokens(message["content"])
+            self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
        result = func(self, model, messages, stream, **kwargs)
-        self.token_usage["generated_tokens"] += count_tokens(result)
+        self.token_usage["generated_tokens"] += num_tokens_from_string(result)
        update_token_usage(self.user_api_key, self.token_usage)
        return result

@@ -36,14 +36,14 @@ def gen_token_usage(func):
 def stream_token_usage(func):
    def wrapper(self, model, messages, stream, **kwargs):
        for message in messages:
-            self.token_usage["prompt_tokens"] += count_tokens(message["content"])
+            self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
        batch = []
        result = func(self, model, messages, stream, **kwargs)
        for r in result:
            batch.append(r)
            yield r
        for line in batch:
-            self.token_usage["generated_tokens"] += count_tokens(line)
+            self.token_usage["generated_tokens"] += num_tokens_from_string(line)
        update_token_usage(self.user_api_key, self.token_usage)

    return wrapper
--- a/application/utils.py
+++ b/application/utils.py
@@ -1,6 +1,22 @@
-from transformers import GPT2TokenizerFast
+import tiktoken

-tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
-tokenizer.model_max_length = 100000
-def count_tokens(string):
-    return len(tokenizer(string)['input_ids'])
+_encoding = None
+
+
+def get_encoding():
+    """Return the shared ``cl100k_base`` encoding, loading it on first use."""
+    global _encoding
+    if _encoding is None:
+        _encoding = tiktoken.get_encoding("cl100k_base")
+    return _encoding
+
+
+def num_tokens_from_string(string: str) -> int:
+    """Return the ``cl100k_base`` token count of ``string``."""
+    # Text such as "<|endoftext|>" must be counted, not rejected with ValueError.
+    return len(get_encoding().encode(string, disallowed_special=()))
+
+
+def count_tokens_docs(docs):
+    """Return the token count of all ``page_content`` values in ``docs`` joined."""
+    return num_tokens_from_string("".join(doc.page_content for doc in docs))
--- a/application/worker.py
+++ b/application/worker.py
@@ -2,8 +2,8 @@ import os
 import shutil
 import string
 import zipfile
-import tiktoken
 from urllib.parse import urljoin
+import logging

 import requests

@@ -13,6 +13,8 @@ from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
+from application.utils import count_tokens_docs
+

 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
@@ -41,7 +43,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
        max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
    """
    if current_depth > max_depth:
-        print(f"Reached maximum recursion depth of {max_depth}")
+        logging.warning(f"Reached maximum recursion depth of {max_depth}")
        return

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -88,16 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    max_tokens = 1250
    recursion_depth = 2
    full_path = os.path.join(directory, user, name_job)
-    import sys

-    print(full_path, file=sys.stderr)
+    logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
    # check if API_URL env variable is set
    file_data = {"name": name_job, "file": filename, "user": user}
    response = requests.get(
        urljoin(settings.API_URL, "/api/download"), params=file_data
    )
-    # check if file is in the response
-    print(response, file=sys.stderr)
    file = response.content

    if not os.path.exists(full_path):
@@ -137,7 +136,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):

    if sample:
        for i in range(min(5, len(raw_docs))):
-            print(raw_docs[i].text)
+            logging.info(f"Sample document {i}: {raw_docs[i]}")

    # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
    # and send them to the server (provide user and name in form)
@@ -180,6 +179,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    self.update_state(state="PROGRESS", meta={"current": 1})
+    logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, "source_data": source_data})

    remote_loader = RemoteCreator.create_loader(loader)
    raw_docs = remote_loader.load_data(source_data)
@@ -212,26 +212,4 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):

    shutil.rmtree(full_path)

-    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
-
-
-def count_tokens_docs(docs):
-    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    return tokens
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
+    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}