From 9228005a7e3de5ae188b5cf2b8764cdf7ba2684f Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 12 Feb 2023 16:25:01 +0000
Subject: [PATCH 01/12] chunked embedding
---
scripts/parser/open_ai_func.py | 28 +++++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py
index 500e4888..00c57be9 100644
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -14,12 +14,30 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
def call_openai_api(docs):
# Function to create a vector store from the documents and save it to disk.
- store = FAISS.from_documents(docs, OpenAIEmbeddings())
- faiss.write_index(store.index, "docs.index")
- store.index = None
+ from tqdm import tqdm
+ docs_test = [docs[0]]
+ # remove the first element from docs
+ docs.pop(0)
+ # cut first n docs if you want to restart
+ #docs = docs[:n]
+ c1 = 0
+ store = FAISS.from_documents(docs_test, OpenAIEmbeddings())
+ for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
+ try:
+ import time
+ store.add_texts([i.page_content], metadatas=[i.metadata])
+ except Exception as e:
+ print(e)
+ print("Error on ", i)
+ print("Saving progress")
+ print(f"stopped at {c1} out of {len(docs)}")
+ store.save_local("outputs")
+ print("Sleeping for 10 seconds and trying again")
+ time.sleep(10)
+ store.add_texts([i.page_content], metadatas=[i.metadata])
+ c1 += 1
- with open("faiss_store.pkl", "wb") as f:
- pickle.dump(store, f)
+ store.save_local("outputs")
def get_user_permission(docs):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
From 205be538a33cc43fcf101a969743cd12ec190d99 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 12 Feb 2023 17:58:54 +0000
Subject: [PATCH 02/12] fix dbqa, with new chain type, also fix for doc export
---
application/app.py | 19 +++++++++++++++----
scripts/parser/open_ai_func.py | 11 +++++++++--
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/application/app.py b/application/app.py
index aa9089ed..c114c63b 100644
--- a/application/app.py
+++ b/application/app.py
@@ -5,8 +5,8 @@ import datetime
from flask import Flask, request, render_template
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
import faiss
-from langchain import OpenAI
-from langchain.chains import VectorDBQAWithSourcesChain
+from langchain import OpenAI, VectorDBQA
+from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import requests
@@ -69,11 +69,22 @@ def api_answer():
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template)
# create a chain with the prompt template and the store
- chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt)
+ #chain = VectorDBQA.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt)
+ # chain = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=api_key, temperature=0), chain_type='map_reduce',
+ # vectorstore=store)
+
+ qa_chain = load_qa_chain(OpenAI(openai_api_key=api_key, temperature=0), chain_type="map_reduce",
+ combine_prompt=c_prompt)
+ chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=store)
+
+
+
# fetch the answer
- result = chain({"question": question})
+ result = chain({"query": question})
+ print(result)
# some formatting for the frontend
+ result['answer'] = result['result']
result['answer'] = result['answer'].replace("\\n", "\n")
result['answer'] = result['answer'].replace("SOURCES:", "")
# mock result
diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py
index 00c57be9..cbd947ee 100644
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -31,13 +31,20 @@ def call_openai_api(docs):
print("Error on ", i)
print("Saving progress")
print(f"stopped at {c1} out of {len(docs)}")
- store.save_local("outputs")
+ faiss.write_index(store.index, "docs.index")
+ store.index = None
+ with open("faiss_store.pkl", "wb") as f:
+ pickle.dump(store, f)
print("Sleeping for 10 seconds and trying again")
time.sleep(10)
store.add_texts([i.page_content], metadatas=[i.metadata])
c1 += 1
- store.save_local("outputs")
+
+ faiss.write_index(store.index, "docs.index")
+ store.index = None
+ with open("faiss_store.pkl", "wb") as f:
+ pickle.dump(store, f)
def get_user_permission(docs):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
From b1a6ebffba593ae772344381b87ca173fe11e83a Mon Sep 17 00:00:00 2001
From: Pavel
Date: Sun, 12 Feb 2023 22:29:40 +0400
Subject: [PATCH 03/12] Directives + Interpreted
Some additional filters for rst parsing
---
scripts/parser/file/rst_parser.py | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py
index 0b887d4e..7c97b326 100644
--- a/scripts/parser/file/rst_parser.py
+++ b/scripts/parser/file/rst_parser.py
@@ -24,6 +24,8 @@ class RstParser(BaseParser):
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
+ remove_interpreters: bool = True,
+ remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
@@ -34,6 +36,8 @@ class RstParser(BaseParser):
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._remove_table_excess = remove_table_excess
+ self._remove_interpreters = remove_interpreters
+ self._remove_directives = remove_directives
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
@@ -95,6 +99,18 @@ class RstParser(BaseParser):
content = re.sub(pattern, r"\1", content)
return content
+ def remove_directives(self, content: str) -> str:
+ """Removes reStructuredText Directives"""
+ pattern = r"`\.\.([^:]+)::"
+ content = re.sub(pattern, "", content)
+ return content
+
+ def remove_interpreters(self, content: str) -> str:
+ """Removes reStructuredText Interpreted Text Roles"""
+ pattern = r":(\w+):"
+ content = re.sub(pattern, "", content)
+ return content
+
def remove_table_excess(self, content: str) -> str:
"""Pattern to remove grid table separators"""
pattern = r"^\+[-]+\+[-]+\+$"
@@ -129,6 +145,10 @@ class RstParser(BaseParser):
content = self.remove_images(content)
if self._remove_table_excess:
content = self.remove_table_excess(content)
+ if self._remove_directives:
+ content = self.remove_directives(content)
+ if self._remove_interpreters:
+ content = self.remove_interpreters(content)
rst_tups = self.rst_to_tups(content)
if self._remove_whitespaces_excess:
rst_tups = self.remove_whitespaces_excess(rst_tups)
From 3ab02ca9596111cc72bd440529b8150c5f811774 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?=
<71683364+mefengl@users.noreply.github.com>
Date: Mon, 13 Feb 2023 11:00:34 +0800
Subject: [PATCH 04/12] feat: compatible with markdown
---
.gitignore | 1 +
scripts/ingest_rst_sphinx.py | 12 ++++++++++++
2 files changed, 13 insertions(+)
diff --git a/.gitignore b/.gitignore
index 8b394e9b..0003c21c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,3 +161,4 @@ frontend/*.sw?
application/vectors/
+**/inputs
diff --git a/scripts/ingest_rst_sphinx.py b/scripts/ingest_rst_sphinx.py
index 9d6c8ece..ecc71570 100644
--- a/scripts/ingest_rst_sphinx.py
+++ b/scripts/ingest_rst_sphinx.py
@@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir):
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
+ elif file.endswith(".md"):
+ # Rename the .md file to .rst file
+ src_file = os.path.join(root, file)
+ dst_file = os.path.join(root, file.replace(".md", ".rst"))
+ os.rename(src_file, dst_file)
+ # Convert the .rst file to .txt file using sphinx-build
+ args = f". -b text -D extensions=sphinx.ext.autodoc " \
+ f"-D master_doc={dst_file} " \
+ f"-D source_suffix=.rst " \
+ f"-C {dst_dir} "
+ sphinx_main(args.split())
+
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
From d5e5a5f59bbcd54b52fc582a4492e569f8b2a7ec Mon Sep 17 00:00:00 2001
From: Carson Yang
Date: Mon, 13 Feb 2023 19:20:15 +0800
Subject: [PATCH 05/12] Add Docker Image CI support
Signed-off-by: Carson Yang
---
.github/workflows/ci.yml | 46 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
create mode 100644 .github/workflows/ci.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..325e0f64
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,46 @@
+name: Build and push DocsGPT Docker image
+
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches: [ main ]
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v1
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v1
+
+ - name: Login to DockerHub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Login to ghcr.io
+ uses: docker/login-action@v2
+ with:
+ registry: ghcr.io
+ username: ${{ github.repository_owner }}
+ password: ${{ secrets.GHCR_TOKEN }}
+
+ # Runs a single command using the runners shell
+ - name: Build and push Docker images to docker.io and ghcr.io
+ uses: docker/build-push-action@v2
+ with:
+ file: './application/Dockerfile'
+ platforms: linux/amd64
+ context: ./application
+ push: true
+ tags: |
+ ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest
+ ghcr.io/${{ github.repository_owner }}/docsgpt:latest
From 458f2a3ff3a9edaa78290f2e1ed0f5237f787fb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?=
<71683364+mefengl@users.noreply.github.com>
Date: Tue, 14 Feb 2023 22:05:16 +0800
Subject: [PATCH 06/12] fix: restore index back when continue process
---
scripts/parser/open_ai_func.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py
index cbd947ee..433e5bcd 100644
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -32,11 +32,14 @@ def call_openai_api(docs):
print("Saving progress")
print(f"stopped at {c1} out of {len(docs)}")
faiss.write_index(store.index, "docs.index")
+ store_index_bak = store.index
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
print("Sleeping for 10 seconds and trying again")
time.sleep(10)
+ faiss.write_index(store_index_bak, "docs.index")
+ store.index = store_index_bak
store.add_texts([i.page_content], metadatas=[i.metadata])
c1 += 1
@@ -66,4 +69,4 @@ def get_user_permission(docs):
elif user_input == "":
call_openai_api(docs)
else:
- print("The API was not called. No money was spent.")
\ No newline at end of file
+ print("The API was not called. No money was spent.")
From 636783ca8a5843b3dfb3e399e14ff3634efb4962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?=
<71683364+mefengl@users.noreply.github.com>
Date: Tue, 14 Feb 2023 22:29:17 +0800
Subject: [PATCH 07/12] fix: avoid second error issue
---
scripts/parser/open_ai_func.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py
index 433e5bcd..70091324 100644
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -36,8 +36,8 @@ def call_openai_api(docs):
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
- print("Sleeping for 10 seconds and trying again")
- time.sleep(10)
+ print("Sleeping for 60 seconds and trying again")
+ time.sleep(60)
faiss.write_index(store_index_bak, "docs.index")
store.index = store_index_bak
store.add_texts([i.page_content], metadatas=[i.metadata])
From 0b42279709a67101ce58bcf782b41fbc61c7278b Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 14 Feb 2023 14:32:32 +0000
Subject: [PATCH 08/12] Update ci.yml
---
.github/workflows/ci.yml | 2 --
1 file changed, 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 325e0f64..92cc718b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,8 +5,6 @@ on:
push:
branches:
- main
- pull_request:
- branches: [ main ]
jobs:
deploy:
From c67956da3794e527d6f997a525c959de298850fd Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 14 Feb 2023 14:55:41 +0000
Subject: [PATCH 09/12] Create CONTRIBUTING.md
---
CONTRIBUTING.md | 38 ++++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
create mode 100644 CONTRIBUTING.md
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..0c10d0b4
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,38 @@
+# Welcome to DocsGPT Contributing guideline
+
+Thank you for choosing this project to contribute to, we are all very grateful!
+
+# We accept different types of contributions
+
+🗣 Discussions - where you can start a new topic or answer some questions
+
+🐞 Issues - this is how we track tasks; sometimes it's bugs that need fixing, sometimes it's new features
+
+🛠️ Pull requests - this is how you can suggest changes to our repository, to work on an existing issue or to add new features
+
+📚 Wiki - where we have our documentation
+
+
+## 🐞 Issues and Pull requests
+
+We value contributions to our issues in the form of discussion or suggestions; we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2)
+
+If you want to contribute by writing code, there are a few things that you should know before doing it:
+We have a frontend (React, Vite) and a backend (Python)
+
+### If you are looking to contribute to the Frontend (⚡️React, Vite):
+The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, and also the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1)
+Please try to follow the guidelines
+
+
+### If you are looking to contribute to the Backend (🐍Python):
+Check out our issues, and contribute to /application or /scripts (ignore the old ingest_rst.py and ingest_rst_sphinx.py files; they will be deprecated soon)
+Currently we don't have any tests (which would be useful 😉), but before submitting your PR make sure that after you ingest some test data it is queryable
+
+### Workflow:
+Create a fork, make changes on your forked repository, and submit the changes in the form of a pull request
+
+## Questions / collaboration
+Please join our [Discord](https://discord.gg/n5BX8dh8rU); don't hesitate — we are very friendly and welcoming to new contributors.
+
+# Thank you so much for considering contributing to DocsGPT! 🎉
From 484764eac1695ada8cbf754ce1ba406cc9b2b815 Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 14 Feb 2023 14:56:25 +0000
Subject: [PATCH 10/12] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 9b81171e..ccd14305 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token
## [Guides](https://github.com/arc53/docsgpt/wiki)
-
+## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation)
From af20c7298aa04a6b64e70347884795abb20a5ad6 Mon Sep 17 00:00:00 2001
From: Pavel
Date: Tue, 14 Feb 2023 19:37:07 +0400
Subject: [PATCH 11/12] new-ingest
Ingest with a CLI
---
scripts/ingest.py | 65 ++++++++++++++++++--------
scripts/{ => old}/ingest_rst.py | 0
scripts/{ => old}/ingest_rst_sphinx.py | 0
3 files changed, 46 insertions(+), 19 deletions(-)
rename scripts/{ => old}/ingest_rst.py (100%)
rename scripts/{ => old}/ingest_rst_sphinx.py (100%)
diff --git a/scripts/ingest.py b/scripts/ingest.py
index cebb6c33..3082cf4e 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,6 +1,9 @@
import sys
import nltk
import dotenv
+import typer
+
+from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -10,28 +13,52 @@ from parser.open_ai_func import call_openai_api, get_user_permission
dotenv.load_dotenv()
-#Specify your folder HERE
-directory_to_ingest = 'inputs'
+app = typer.Typer(add_completion=False)
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
+nltk.download('punkt', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
#Splits all files in specified folder to documents
-raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
-raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = RecursiveCharacterTextSplitter()
-docs = text_splitter.split_documents(raw_docs)
+@app.command()
+def ingest(directory: Optional[str] = typer.Option("inputs",
+ help="Path to the directory for index creation."),
+ files: Optional[List[str]] = typer.Option(None,
+ help="""File paths to use (Optional; overrides directory).
+ E.g. --files inputs/1.md --files inputs/2.md"""),
+ recursive: Optional[bool] = typer.Option(True,
+ help="Whether to recursively search in subdirectories."),
+ limit: Optional[int] = typer.Option(None,
+ help="Maximum number of files to read."),
+ formats: Optional[List[str]] = typer.Option([".rst", ".md"],
+ help="""List of required extensions (list with .)
+ Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
+ exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
- permission_bypass_flag = sys.argv[1]
- if permission_bypass_flag == '-y':
- call_openai_api(docs)
+ """
+ Creates index from specified location or files.
+ By default /inputs folder is used, .rst and .md are parsed.
+ """
+ raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
+ required_exts=formats, num_files_limit=limit,
+ exclude_hidden=exclude).load_data()
+ raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+ print(raw_docs)
+ # Here we split the documents, as needed, into smaller chunks.
+ # We do this due to the context limits of the LLMs.
+ text_splitter = RecursiveCharacterTextSplitter()
+ docs = text_splitter.split_documents(raw_docs)
+
+ # Here we check for command line arguments for bot calls.
+ # If no argument exists or the permission_bypass_flag argument is not '-y',
+ # user permission is requested to call the API.
+ if len(sys.argv) > 1:
+ permission_bypass_flag = sys.argv[1]
+ if permission_bypass_flag == '-y':
+ call_openai_api(docs)
+ else:
+ get_user_permission(docs)
else:
get_user_permission(docs)
-else:
- get_user_permission(docs)
\ No newline at end of file
+
+if __name__ == "__main__":
+ app()
diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py
similarity index 100%
rename from scripts/ingest_rst.py
rename to scripts/old/ingest_rst.py
diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py
similarity index 100%
rename from scripts/ingest_rst_sphinx.py
rename to scripts/old/ingest_rst_sphinx.py
From 7af703451918234623c30d7bf62df5957397b49e Mon Sep 17 00:00:00 2001
From: Pavel
Date: Tue, 14 Feb 2023 19:41:37 +0400
Subject: [PATCH 12/12] requirements
---
application/requirements.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/application/requirements.txt b/application/requirements.txt
index 9e8f73b1..7972f8c3 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -60,6 +60,7 @@ tiktoken==0.1.2
tokenizers==0.13.2
tqdm==4.64.1
transformers==4.26.0
+typer==0.7.0
typing-inspect==0.8.0
typing_extensions==4.4.0
urllib3==1.26.14