Merge branch 'main' into feature/remote-loads

Authored by Alex on 2024-03-01 14:38:27 +00:00; committed by GitHub
17 changed files with 181 additions and 52 deletions


@@ -2,15 +2,17 @@ FROM python:3.11-slim-bullseye as builder
 # Tiktoken requires Rust toolchain, so build it in a separate stage
 RUN apt-get update && apt-get install -y gcc curl
+RUN apt-get install -y wget unzip
+RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip
+RUN unzip mpnet-base-v2.zip -d model
+RUN rm mpnet-base-v2.zip
 RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip && pip install tiktoken==0.5.2
 COPY requirements.txt .
 RUN pip install -r requirements.txt
-RUN apt-get install -y wget unzip
-RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip
-RUN unzip mpnet-base-v2.zip -d model
-RUN rm mpnet-base-v2.zip
 FROM python:3.11-slim-bullseye
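
This moves the model download into an early builder layer, so the cached layer survives changes to requirements.txt. A minimal sketch of how the unzipped model directory is typically consumed at runtime, assuming the app loads it with sentence-transformers (the library and path are assumptions, not shown in this diff):

    from sentence_transformers import SentenceTransformer

    # Load the MPNet embedding model from the directory the Dockerfile unzipped it into.
    model = SentenceTransformer("model")
    vectors = model.encode(["How do I configure the PremAI backend?"])
    print(vectors.shape)  # mpnet-base-v2 produces 768-dimensional embeddings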


@@ -39,6 +39,9 @@ class Settings(BaseSettings):
     SAGEMAKER_ACCESS_KEY: Optional[str] = None  # SageMaker access key
     SAGEMAKER_SECRET_KEY: Optional[str] = None  # SageMaker secret key
 
+    # PremAI project ID
+    PREMAI_PROJECT_ID: Optional[str] = None
 
 path = Path(__file__).parent.parent.absolute()
 settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")
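
The new field follows the existing pattern: pydantic's BaseSettings reads PREMAI_PROJECT_ID from the .env file (or the process environment) and leaves it as None when unset. A small illustrative guard, not part of the diff:

    from application.core.settings import settings

    # PREMAI_PROJECT_ID defaults to None, so callers should check it before
    # routing requests to the PremAI backend.
    if settings.PREMAI_PROJECT_ID is None:
        raise RuntimeError("Set PREMAI_PROJECT_ID in .env to use the 'premai' provider")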


@@ -20,7 +20,7 @@ class DocsGPTAPILLM(BaseLLM):
                 "max_new_tokens": 30
             }
         )
-        response_clean = response.json()['a'].split("###")[0]
+        response_clean = response.json()['a'].replace("###", "")
 
         return response_clean
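
The two string operations are not equivalent: split("###")[0] truncated the answer at the first marker, while replace("###", "") removes every marker but keeps the text around them. A quick illustration:

    raw = "Paris is the capital### of France###"
    print(raw.split("###")[0])     # 'Paris is the capital'
    print(raw.replace("###", ""))  # 'Paris is the capital of France'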


@@ -4,6 +4,7 @@ from application.llm.huggingface import HuggingFaceLLM
 from application.llm.llama_cpp import LlamaCpp
 from application.llm.anthropic import AnthropicLLM
 from application.llm.docsgpt_provider import DocsGPTAPILLM
+from application.llm.premai import PremAILLM
@@ -15,7 +16,8 @@ class LLMCreator:
         'huggingface': HuggingFaceLLM,
         'llama.cpp': LlamaCpp,
         'anthropic': AnthropicLLM,
-        'docsgpt': DocsGPTAPILLM
+        'docsgpt': DocsGPTAPILLM,
+        'premai': PremAILLM,
     }
 
     @classmethod
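
With the registry entry in place, selecting the backend is just a dictionary lookup. A usage sketch, assuming LLMCreator exposes a create_llm classmethod that instantiates the mapped class (the exact signature is not shown in this diff):

    from application.llm.llm_creator import LLMCreator

    # 'premai' now resolves to PremAILLM; an unknown key raises KeyError.
    llm = LLMCreator.create_llm('premai', api_key='<PREMAI_API_KEY>')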

application/llm/premai.py (new file, 33 lines)

@@ -0,0 +1,33 @@
+from application.llm.base import BaseLLM
+from application.core.settings import settings
+
+
+class PremAILLM(BaseLLM):
+
+    def __init__(self, api_key):
+        from premai import Prem
+
+        self.client = Prem(
+            api_key=api_key
+        )
+        self.api_key = api_key
+        self.project_id = settings.PREMAI_PROJECT_ID
+
+    def gen(self, model, engine, messages, stream=False, **kwargs):
+        response = self.client.chat.completions.create(model=model,
+                                                       project_id=self.project_id,
+                                                       messages=messages,
+                                                       stream=stream,
+                                                       **kwargs)
+
+        return response.choices[0].message["content"]
+
+    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
+        response = self.client.chat.completions.create(model=model,
+                                                       project_id=self.project_id,
+                                                       messages=messages,
+                                                       stream=stream,
+                                                       **kwargs)
+
+        for line in response:
+            if line.choices[0].delta["content"] is not None:
+                yield line.choices[0].delta["content"]
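
A short usage sketch for the new provider (the model name and API key are placeholders; PremAI resolves models per project, and PREMAI_PROJECT_ID must be set in settings):

    llm = PremAILLM(api_key="<PREMAI_API_KEY>")

    # Blocking call
    print(llm.gen(model="gpt-3.5-turbo", engine=None,
                  messages=[{"role": "user", "content": "What is DocsGPT?"}]))

    # Streaming call
    for chunk in llm.gen_stream(model="gpt-3.5-turbo", engine=None,
                                messages=[{"role": "user", "content": "What is DocsGPT?"}]):
        print(chunk, end="")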


@@ -147,12 +147,24 @@ class SimpleDirectoryReader(BaseReader):
                 # do standard read
                 with open(input_file, "r", errors=self.errors) as f:
                     data = f.read()
-            if isinstance(data, List):
-                data_list.extend(data)
-            else:
-                data_list.append(str(data))
+            # Prepare metadata for this file
             if self.file_metadata is not None:
-                metadata_list.append(self.file_metadata(str(input_file)))
+                file_metadata = self.file_metadata(str(input_file))
+            else:
+                # Provide a default empty metadata
+                file_metadata = {'title': '', 'store': ''}
+            # TODO: Find a case with no metadata and check if it breaks anything
+            if isinstance(data, List):
+                # Extend data_list with each item in the data list
+                data_list.extend([str(d) for d in data])
+                # For each item in the data list, add the file's metadata to metadata_list
+                metadata_list.extend([file_metadata for _ in data])
+            else:
+                # Add the single piece of data to data_list
+                data_list.append(str(data))
+                # Add the file's metadata to metadata_list
+                metadata_list.append(file_metadata)
 
         if concatenate:
             return [Document("\n".join(data_list))]
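
The substance of this change is that metadata_list stays index-aligned with data_list even when a parser returns a list of chunks for a single file; the old code appended at most one metadata entry per file, so the two lists could drift apart. A minimal standalone sketch of the new bookkeeping (names here are illustrative, not from the repo):

    def collect(data, file_metadata):
        data_list, metadata_list = [], []
        if isinstance(data, list):
            data_list.extend(str(d) for d in data)
            metadata_list.extend(file_metadata for _ in data)  # one copy per chunk
        else:
            data_list.append(str(data))
            metadata_list.append(file_metadata)
        return data_list, metadata_list

    d, m = collect(["chunk 1", "chunk 2"], {"title": "a.md", "store": ""})
    assert len(d) == len(m) == 2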


@@ -21,16 +21,15 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
     for doc in documents:
         doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
-        if current_group is None:
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(
-                current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
-            current_group.text += " " + doc.text
+        # Check if current group is empty or if the document can be added based on token count and matching metadata
+        if current_group is None or (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len < min_tokens and current_group.extra_info == doc.extra_info):
+            if current_group is None:
+                current_group = doc  # Use the document directly to retain its metadata
+            else:
+                current_group.text += " " + doc.text  # Append text to the current group
         else:
             docs.append(current_group)
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
+            current_group = doc  # Start a new group with the current document
 
     if current_group is not None:
         docs.append(current_group)
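
The functional change is the added current_group.extra_info == doc.extra_info guard: chunks are now only merged when their metadata matches, so a grouped document never mixes sources. A quick illustrative check (import paths are assumptions; the Document constructor matches the removed lines above):

    from application.parser.schema.base import Document  # path assumed
    from application.parser.token_func import group_documents  # path assumed

    grouped = group_documents(
        [Document(text="alpha", doc_id="1", embedding=None, extra_info={"title": "a.md", "store": ""}),
         Document(text="beta", doc_id="2", embedding=None, extra_info={"title": "b.md", "store": ""})],
        min_tokens=50, max_tokens=2000,
    )
    assert len(grouped) == 2  # differing extra_info now prevents merging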