mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
Merge branch 'main' into feature/streaming
This commit is contained in:
@@ -24,9 +24,11 @@ from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
)
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
from langchain.llms import GPT4All
|
||||
|
||||
from core.settings import settings
|
||||
from error import bad_request
|
||||
@@ -108,6 +110,7 @@ def run_async_chain(chain, question, chat_history):
|
||||
result["answer"] = answer
|
||||
return result
|
||||
|
||||
|
||||
def get_vectorstore(data):
|
||||
if "active_docs" in data:
|
||||
if data["active_docs"].split("/")[0] == "local":
|
||||
@@ -134,6 +137,7 @@ def get_docsearch(vectorstore, embeddings_key):
|
||||
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
|
||||
return docsearch
|
||||
|
||||
|
||||
@celery.task(bind=True)
|
||||
def ingest(self, directory, formats, name_job, filename, user):
|
||||
resp = ingest_worker(self, directory, formats, name_job, filename, user)
|
||||
@@ -216,17 +220,26 @@ def api_answer():
|
||||
# Note: if you have used embeddings other than OpenAI, you need to change the embeddings
|
||||
docsearch = get_docsearch(vectorstore, embeddings_key)
|
||||
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
|
||||
template_format="jinja2")
|
||||
|
||||
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
|
||||
template_format="jinja2")
|
||||
if settings.LLM_NAME == "openai_chat":
|
||||
llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4"
|
||||
messages_combine = [
|
||||
SystemMessagePromptTemplate.from_template(chat_combine_template),
|
||||
HumanMessagePromptTemplate.from_template("{question}")
|
||||
]
|
||||
messages_combine = [SystemMessagePromptTemplate.from_template(chat_combine_template)]
|
||||
if history:
|
||||
tokens_current_history = 0
|
||||
tokens_max_history = 1000
|
||||
# count tokens in history and keep only the exchanges that fit within tokens_max_history
|
||||
history.reverse()
|
||||
for i in history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = llm.get_num_tokens(i["prompt"]) + llm.get_num_tokens(i["response"])
|
||||
if tokens_current_history + tokens_batch < tokens_max_history:
|
||||
tokens_current_history += tokens_batch
|
||||
messages_combine.append(HumanMessagePromptTemplate.from_template(i["prompt"]))
|
||||
messages_combine.append(AIMessagePromptTemplate.from_template(i["response"]))
|
||||
messages_combine.append(HumanMessagePromptTemplate.from_template("{question}"))
|
||||
import sys
|
||||
print(messages_combine, file=sys.stderr)
|
||||
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
|
||||
elif settings.LLM_NAME == "openai":
|
||||
llm = OpenAI(openai_api_key=api_key, temperature=0)
|
||||
@@ -236,6 +249,8 @@ def api_answer():
|
||||
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
|
||||
elif settings.LLM_NAME == "cohere":
|
||||
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
|
||||
elif settings.LLM_NAME == "gpt4all":
|
||||
llm = GPT4All(model=settings.MODEL_PATH)
|
||||
else:
|
||||
raise ValueError("unknown LLM model")
|
||||
|
||||
@@ -251,9 +266,22 @@ def api_answer():
|
||||
# result = chain({"question": question, "chat_history": chat_history})
|
||||
# generate async with async generate method
|
||||
result = run_async_chain(chain, question, chat_history)
|
||||
elif settings.LLM_NAME == "gpt4all":
|
||||
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
|
||||
doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine)
|
||||
chain = ConversationalRetrievalChain(
|
||||
retriever=docsearch.as_retriever(k=2),
|
||||
question_generator=question_generator,
|
||||
combine_docs_chain=doc_chain,
|
||||
)
|
||||
chat_history = []
|
||||
# result = chain({"question": question, "chat_history": chat_history})
|
||||
# generate async with async generate method
|
||||
result = run_async_chain(chain, question, chat_history)
|
||||
|
||||
else:
|
||||
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
|
||||
combine_prompt=c_prompt, question_prompt=q_prompt)
|
||||
combine_prompt=chat_combine_template, question_prompt=q_prompt)
|
||||
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=3)
|
||||
result = chain({"query": question})
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ class Settings(BaseSettings):
|
||||
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
|
||||
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
|
||||
MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
|
||||
MODEL_PATH: str = "./models/gpt4all-model.bin"
|
||||
|
||||
API_URL: str = "http://localhost:5001" # backend url for celery worker
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
You are DocsGPT, a friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
|
||||
Use the following pieces of context to help answer the user's question. If it's not relevant to the question, provide friendly responses.
|
||||
You have access to chat history, and can use it to help answer the question.
|
||||
When using code examples, use the following format:
|
||||
```(language)
|
||||
(code)
|
||||
|
||||
@@ -26,10 +26,12 @@ ecdsa==0.18.0
|
||||
entrypoints==0.4
|
||||
faiss-cpu==1.7.3
|
||||
filelock==3.9.0
|
||||
Flask==2.3.2
|
||||
Flask==2.2.3
|
||||
Flask-Cors==3.0.10
|
||||
frozenlist==1.3.3
|
||||
geojson==2.5.0
|
||||
greenlet==2.0.2
|
||||
gpt4all==0.1.7
|
||||
hub==3.0.1
|
||||
huggingface-hub==0.12.1
|
||||
humbug==0.2.8
|
||||
@@ -39,7 +41,8 @@ Jinja2==3.1.2
|
||||
jmespath==1.0.1
|
||||
joblib==1.2.0
|
||||
kombu==5.2.4
|
||||
langchain==0.0.126
|
||||
langchain==0.0.179
|
||||
loguru==0.6.0
|
||||
lxml==4.9.2
|
||||
MarkupSafe==2.1.2
|
||||
marshmallow==3.19.0
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
"""Simple reader that reads files of different formats from a directory."""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
from parser.file.base import BaseReader
|
||||
from parser.file.base_parser import BaseParser
|
||||
from parser.file.docs_parser import DocxParser, PDFParser
|
||||
@@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser
|
||||
from parser.file.rst_parser import RstParser
|
||||
from parser.file.tabular_parser import PandasCSVParser
|
||||
from parser.schema.base import Document
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".pdf": PDFParser(),
|
||||
@@ -151,11 +150,16 @@ class SimpleDirectoryReader(BaseReader):
|
||||
data = f.read()
|
||||
if isinstance(data, List):
|
||||
data_list.extend(data)
|
||||
if self.file_metadata is not None:
|
||||
for _ in range(len(data)):
|
||||
metadata_list.append(self.file_metadata(str(input_file)))
|
||||
else:
|
||||
data_list.append(str(data))
|
||||
if self.file_metadata is not None:
|
||||
metadata_list.append(self.file_metadata(str(input_file)))
|
||||
|
||||
|
||||
|
||||
if concatenate:
|
||||
return [Document("\n".join(data_list))]
|
||||
elif self.file_metadata is not None:
|
||||
|
||||
Reference in New Issue
Block a user