From 9228005a7e3de5ae188b5cf2b8764cdf7ba2684f Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 12 Feb 2023 16:25:01 +0000 Subject: [PATCH 01/12] chunked embedding --- scripts/parser/open_ai_func.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 500e4888..00c57be9 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -14,12 +14,30 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: def call_openai_api(docs): # Function to create a vector store from the documents and save it to disk. - store = FAISS.from_documents(docs, OpenAIEmbeddings()) - faiss.write_index(store.index, "docs.index") - store.index = None + from tqdm import tqdm + docs_test = [docs[0]] + # remove the first element from docs + docs.pop(0) + # cut first n docs if you want to restart + #docs = docs[:n] + c1 = 0 + store = FAISS.from_documents(docs_test, OpenAIEmbeddings()) + for i in tqdm(docs, desc="Embedding ๐Ÿฆ–", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): + try: + import time + store.add_texts([i.page_content], metadatas=[i.metadata]) + except Exception as e: + print(e) + print("Error on ", i) + print("Saving progress") + print(f"stopped at {c1} out of {len(docs)}") + store.save_local("outputs") + print("Sleeping for 10 seconds and trying again") + time.sleep(10) + store.add_texts([i.page_content], metadatas=[i.metadata]) + c1 += 1 - with open("faiss_store.pkl", "wb") as f: - pickle.dump(store, f) + store.save_local("outputs") def get_user_permission(docs): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. 
From 205be538a33cc43fcf101a969743cd12ec190d99 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 12 Feb 2023 17:58:54 +0000 Subject: [PATCH 02/12] fix dbqa, with new chain type, also fix for doc export --- application/app.py | 19 +++++++++++++++---- scripts/parser/open_ai_func.py | 11 +++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/application/app.py b/application/app.py index aa9089ed..c114c63b 100644 --- a/application/app.py +++ b/application/app.py @@ -5,8 +5,8 @@ import datetime from flask import Flask, request, render_template # os.environ["LANGCHAIN_HANDLER"] = "langchain" import faiss -from langchain import OpenAI -from langchain.chains import VectorDBQAWithSourcesChain +from langchain import OpenAI, VectorDBQA +from langchain.chains.question_answering import load_qa_chain from langchain.prompts import PromptTemplate import requests @@ -69,11 +69,22 @@ def api_answer(): c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template) # create a chain with the prompt template and the store - chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + #chain = VectorDBQA.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + # chain = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=api_key, temperature=0), chain_type='map_reduce', + # vectorstore=store) + + qa_chain = load_qa_chain(OpenAI(openai_api_key=api_key, temperature=0), chain_type="map_reduce", + combine_prompt=c_prompt) + chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=store) + + + # fetch the answer - result = chain({"question": question}) + result = chain({"query": question}) + print(result) # some formatting for the frontend + result['answer'] = result['result'] result['answer'] = result['answer'].replace("\\n", "
") result['answer'] = result['answer'].replace("SOURCES:", "") # mock result diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 00c57be9..cbd947ee 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -31,13 +31,20 @@ def call_openai_api(docs): print("Error on ", i) print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") - store.save_local("outputs") + faiss.write_index(store.index, "docs.index") + store.index = None + with open("faiss_store.pkl", "wb") as f: + pickle.dump(store, f) print("Sleeping for 10 seconds and trying again") time.sleep(10) store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 - store.save_local("outputs") + + faiss.write_index(store.index, "docs.index") + store.index = None + with open("faiss_store.pkl", "wb") as f: + pickle.dump(store, f) def get_user_permission(docs): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. From b1a6ebffba593ae772344381b87ca173fe11e83a Mon Sep 17 00:00:00 2001 From: Pavel Date: Sun, 12 Feb 2023 22:29:40 +0400 Subject: [PATCH 03/12] Directives + Interpreted Some additional filters for rst parsing --- scripts/parser/file/rst_parser.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py index 0b887d4e..7c97b326 100644 --- a/scripts/parser/file/rst_parser.py +++ b/scripts/parser/file/rst_parser.py @@ -24,6 +24,8 @@ class RstParser(BaseParser): remove_hyperlinks: bool = True, remove_images: bool = True, remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, remove_whitespaces_excess: bool = True, #Be carefull with remove_characters_excess, might cause data loss remove_characters_excess: bool = True, @@ -34,6 +36,8 @@ class RstParser(BaseParser): self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images self._remove_table_excess = 
remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess @@ -95,6 +99,18 @@ class RstParser(BaseParser): content = re.sub(pattern, r"\1", content) return content + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + def remove_table_excess(self, content: str) -> str: """Pattern to remove grid table separators""" pattern = r"^\+[-]+\+[-]+\+$" @@ -129,6 +145,10 @@ class RstParser(BaseParser): content = self.remove_images(content) if self._remove_table_excess: content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) rst_tups = self.rst_to_tups(content) if self._remove_whitespaces_excess: rst_tups = self.remove_whitespaces_excess(rst_tups) From 3ab02ca9596111cc72bd440529b8150c5f811774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Mon, 13 Feb 2023 11:00:34 +0800 Subject: [PATCH 04/12] feat: compatible with markdown --- .gitignore | 1 + scripts/ingest_rst_sphinx.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index 8b394e9b..0003c21c 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ frontend/*.sw? 
application/vectors/ +**/inputs diff --git a/scripts/ingest_rst_sphinx.py b/scripts/ingest_rst_sphinx.py index 9d6c8ece..ecc71570 100644 --- a/scripts/ingest_rst_sphinx.py +++ b/scripts/ingest_rst_sphinx.py @@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir): f"-D source_suffix=.rst " \ f"-C {dst_dir} " sphinx_main(args.split()) + elif file.endswith(".md"): + # Rename the .md file to .rst file + src_file = os.path.join(root, file) + dst_file = os.path.join(root, file.replace(".md", ".rst")) + os.rename(src_file, dst_file) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={dst_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) + def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. From d5e5a5f59bbcd54b52fc582a4492e569f8b2a7ec Mon Sep 17 00:00:00 2001 From: Carson Yang Date: Mon, 13 Feb 2023 19:20:15 +0800 Subject: [PATCH 05/12] Add Docker Image CI support Signed-off-by: Carson Yang --- .github/workflows/ci.yml | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..325e0f64 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,46 @@ +name: Build and push DocsGPT Docker image + +on: + workflow_dispatch: + push: + branches: + - main + pull_request: + branches: [ main ] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to ghcr.io + uses: docker/login-action@v2 + with: 
+ registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} + + # Runs a single command using the runners shell + - name: Build and push Docker images to docker.io and ghcr.io + uses: docker/build-push-action@v2 + with: + file: './application/Dockerfile' + platforms: linux/amd64 + context: ./application + push: true + tags: | + ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest + ghcr.io/${{ github.repository_owner }}/docsgpt:latest From 458f2a3ff3a9edaa78290f2e1ed0f5237f787fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:05:16 +0800 Subject: [PATCH 06/12] fix: restore index back when continue process --- scripts/parser/open_ai_func.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index cbd947ee..433e5bcd 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -32,11 +32,14 @@ def call_openai_api(docs): print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") faiss.write_index(store.index, "docs.index") + store_index_bak = store.index store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) print("Sleeping for 10 seconds and trying again") time.sleep(10) + faiss.write_index(store_index_bak, "docs.index") + store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 @@ -66,4 +69,4 @@ def get_user_permission(docs): elif user_input == "": call_openai_api(docs) else: - print("The API was not called. No money was spent.") \ No newline at end of file + print("The API was not called. 
No money was spent.") From 636783ca8a5843b3dfb3e399e14ff3634efb4962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:29:17 +0800 Subject: [PATCH 07/12] fix: avoid second error issue --- scripts/parser/open_ai_func.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 433e5bcd..70091324 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -36,8 +36,8 @@ def call_openai_api(docs): store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) - print("Sleeping for 10 seconds and trying again") - time.sleep(10) + print("Sleeping for 60 seconds and trying again") + time.sleep(60) faiss.write_index(store_index_bak, "docs.index") store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) From 0b42279709a67101ce58bcf782b41fbc61c7278b Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:32:32 +0000 Subject: [PATCH 08/12] Update ci.yml --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 325e0f64..92cc718b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,8 +5,6 @@ on: push: branches: - main - pull_request: - branches: [ main ] jobs: deploy: From c67956da3794e527d6f997a525c959de298850fd Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:55:41 +0000 Subject: [PATCH 09/12] Create CONTRIBUTING.md --- CONTRIBUTING.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..0c10d0b4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +# Welcome to DocsGPT Contributing guideline + +Thank you for choosing this project to contribute to, we are all very grateful! 
+ +# We accept different types of contributions + +📣 Discussions - where you can start a new topic or answer some questions + +🐞 Issues - Is how we track tasks, sometimes its bugs that need fixing, sometimes its new features + +🛠️ Pull requests - Is how you can suggest changes to our repository, to work on existing issue or to add new features + +📚 Wiki - where we have our documentation + + +## 🐞 Issues and Pull requests + +We value contributions to our issues in form of discussion or suggestion, we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2) + +If you want to contribute by writing code there are few things that you should know before doing it: +We have frontend (React, Vite) and Backend (python) + +### If you are looking to contribute to Frontend (⚛️React, Vite): +Current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new on. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) +Please try to follow guidelines + + +### If you are looking to contribute to Backend (🐍Python): +Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon) +Currently we don't have any tests(which would be useful😉) but before submitting you PR make sure that after you ingested some test data its queryable + +### Workflow: +Create a fork, make changes on your forked repository, submit changes in a form of pull request + +## Questions / collaboration +Please join our [Discord](https://discord.gg/n5BX8dh8rU) don't hesitate, we are very friendly and welcoming to new contributors. 
+ +# Thank you so much for considering to contribute to DocsGPT!🙏 From 484764eac1695ada8cbf754ce1ba406cc9b2b815 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:56:25 +0000 Subject: [PATCH 10/12] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b81171e..ccd14305 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token ## [Guides](https://github.com/arc53/docsgpt/wiki) - +## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md) ## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation) From af20c7298aa04a6b64e70347884795abb20a5ad6 Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 14 Feb 2023 19:37:07 +0400 Subject: [PATCH 11/12] new-ingest Ingest with a CLI --- scripts/ingest.py | 65 ++++++++++++++++++-------- scripts/{ => old}/ingest_rst.py | 0 scripts/{ => old}/ingest_rst_sphinx.py | 0 3 files changed, 46 insertions(+), 19 deletions(-) rename scripts/{ => old}/ingest_rst.py (100%) rename scripts/{ => old}/ingest_rst_sphinx.py (100%) diff --git a/scripts/ingest.py b/scripts/ingest.py index cebb6c33..3082cf4e 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,6 +1,9 @@ import sys import nltk import dotenv +import typer + +from typing import List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -10,28 +13,52 @@ from parser.open_ai_func import call_openai_api, get_user_permission dotenv.load_dotenv() -#Specify your folder HERE -directory_to_ingest = 'inputs' +app = typer.Typer(add_completion=False) -nltk.download('punkt') -nltk.download('averaged_perceptron_tagger') +nltk.download('punkt', quiet=True) +nltk.download('averaged_perceptron_tagger', quiet=True) #Splits all files in specified folder to documents -raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data() 
-raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] -# Here we split the documents, as needed, into smaller chunks. -# We do this due to the context limits of the LLMs. -text_splitter = RecursiveCharacterTextSplitter() -docs = text_splitter.split_documents(raw_docs) +@app.command() +def ingest(directory: Optional[str] = typer.Option("inputs", + help="Path to the directory for index creation."), + files: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides directory). + E.g. --files inputs/1.md --files inputs/2.md"""), + recursive: Optional[bool] = typer.Option(True, + help="Whether to recursively search in subdirectories."), + limit: Optional[int] = typer.Option(None, + help="Maximum number of files to read."), + formats: Optional[List[str]] = typer.Option([".rst", ".md"], + help="""List of required extensions (list with .) + Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""), + exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): -# Here we check for command line arguments for bot calls. -# If no argument exists or the permission_bypass_flag argument is not '-y', -# user permission is requested to call the API. -if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) + """ + Creates index from specified location or files. + By default /inputs folder is used, .rst and .md are parsed. + """ + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. 
+ text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the permission_bypass_flag argument is not '-y', + # user permission is requested to call the API. + if len(sys.argv) > 1: + permission_bypass_flag = sys.argv[1] + if permission_bypass_flag == '-y': + call_openai_api(docs) + else: + get_user_permission(docs) else: get_user_permission(docs) -else: - get_user_permission(docs) \ No newline at end of file + +if __name__ == "__main__": + app() diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py similarity index 100% rename from scripts/ingest_rst.py rename to scripts/old/ingest_rst.py diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py similarity index 100% rename from scripts/ingest_rst_sphinx.py rename to scripts/old/ingest_rst_sphinx.py From 7af703451918234623c30d7bf62df5957397b49e Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 14 Feb 2023 19:41:37 +0400 Subject: [PATCH 12/12] requirements --- application/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/application/requirements.txt b/application/requirements.txt index 9e8f73b1..7972f8c3 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -60,6 +60,7 @@ tiktoken==0.1.2 tokenizers==0.13.2 tqdm==4.64.1 transformers==4.26.0 +typer==0.7.0 typing-inspect==0.8.0 typing_extensions==4.4.0 urllib3==1.26.14