Merge branch 'main' into feature/streaming

This commit is contained in:
Alex
2023-05-31 22:15:53 +01:00
committed by GitHub
6 changed files with 53 additions and 16 deletions

View File

@@ -24,9 +24,11 @@ from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
AIMessagePromptTemplate,
)
from pymongo import MongoClient
from werkzeug.utils import secure_filename
from langchain.llms import GPT4All
from core.settings import settings
from error import bad_request
@@ -108,6 +110,7 @@ def run_async_chain(chain, question, chat_history):
result["answer"] = answer
return result
def get_vectorstore(data):
if "active_docs" in data:
if data["active_docs"].split("/")[0] == "local":
@@ -134,6 +137,7 @@ def get_docsearch(vectorstore, embeddings_key):
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
return docsearch
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)
@@ -216,17 +220,26 @@ def api_answer():
# Note: if you have used embeddings other than OpenAI, you need to change the embeddings
docsearch = get_docsearch(vectorstore, embeddings_key)
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
template_format="jinja2")
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if settings.LLM_NAME == "openai_chat":
llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4"
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
messages_combine = [SystemMessagePromptTemplate.from_template(chat_combine_template)]
if history:
tokens_current_history = 0
tokens_max_history = 1000
#count tokens in history
history.reverse()
for i in history:
if "prompt" in i and "response" in i:
tokens_batch = llm.get_num_tokens(i["prompt"]) + llm.get_num_tokens(i["response"])
if tokens_current_history + tokens_batch < tokens_max_history:
tokens_current_history += tokens_batch
messages_combine.append(HumanMessagePromptTemplate.from_template(i["prompt"]))
messages_combine.append(AIMessagePromptTemplate.from_template(i["response"]))
messages_combine.append(HumanMessagePromptTemplate.from_template("{question}"))
import sys
print(messages_combine, file=sys.stderr)
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
@@ -236,6 +249,8 @@ def api_answer():
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
elif settings.LLM_NAME == "cohere":
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
elif settings.LLM_NAME == "gpt4all":
llm = GPT4All(model=settings.MODEL_PATH)
else:
raise ValueError("unknown LLM model")
@@ -251,9 +266,22 @@ def api_answer():
# result = chain({"question": question, "chat_history": chat_history})
# generate async with async generate method
result = run_async_chain(chain, question, chat_history)
elif settings.LLM_NAME == "gpt4all":
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine)
chain = ConversationalRetrievalChain(
retriever=docsearch.as_retriever(k=2),
question_generator=question_generator,
combine_docs_chain=doc_chain,
)
chat_history = []
# result = chain({"question": question, "chat_history": chat_history})
# generate async with async generate method
result = run_async_chain(chain, question, chat_history)
else:
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
combine_prompt=c_prompt, question_prompt=q_prompt)
combine_prompt=chat_combine_template, question_prompt=q_prompt)
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=3)
result = chain({"query": question})

View File

@@ -9,6 +9,7 @@ class Settings(BaseSettings):
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
MODEL_PATH: str = "./models/gpt4all-model.bin"
API_URL: str = "http://localhost:5001" # backend url for celery worker

View File

@@ -1,5 +1,6 @@
You are DocsGPT, a friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
Use the following pieces of context to help answer the user's question. If it's not relevant to the question, provide friendly responses.
You have access to chat history, and can use it to help answer the question.
When using code examples, use the following format:
```(language)
(code)

View File

@@ -26,10 +26,12 @@ ecdsa==0.18.0
entrypoints==0.4
faiss-cpu==1.7.3
filelock==3.9.0
Flask==2.3.2
Flask==2.2.3
Flask-Cors==3.0.10
frozenlist==1.3.3
geojson==2.5.0
greenlet==2.0.2
gpt4all==0.1.7
hub==3.0.1
huggingface-hub==0.12.1
humbug==0.2.8
@@ -39,7 +41,8 @@ Jinja2==3.1.2
jmespath==1.0.1
joblib==1.2.0
kombu==5.2.4
langchain==0.0.126
langchain==0.0.179
loguru==0.6.0
lxml==4.9.2
MarkupSafe==2.1.2
marshmallow==3.19.0

View File

@@ -1,8 +1,5 @@
"""Simple reader that reads files of different formats from a directory."""
import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
@@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
@@ -151,11 +150,16 @@ class SimpleDirectoryReader(BaseReader):
data = f.read()
if isinstance(data, List):
data_list.extend(data)
if self.file_metadata is not None:
for _ in range(len(data)):
metadata_list.append(self.file_metadata(str(input_file)))
else:
data_list.append(str(data))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if concatenate:
return [Document("\n".join(data_list))]
elif self.file_metadata is not None: