diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..92cc718b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,44 @@ +name: Build and push DocsGPT Docker image + +on: + workflow_dispatch: + push: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to ghcr.io + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} + + # Runs a single command using the runners shell + - name: Build and push Docker images to docker.io and ghcr.io + uses: docker/build-push-action@v2 + with: + file: './application/Dockerfile' + platforms: linux/amd64 + context: ./application + push: true + tags: | + ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest + ghcr.io/${{ github.repository_owner }}/docsgpt:latest diff --git a/.gitignore b/.gitignore index 8b394e9b..0003c21c 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ frontend/*.sw? application/vectors/ +**/inputs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..0c10d0b4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +# Welcome to DocsGPT Contributing guideline + +Thank you for choosing this project to contribute to, we are all very grateful! 
+ +# We accept different types of contributions + +📣 Discussions - where you can start a new topic or answer some questions + +🐞 Issues - is how we track tasks; sometimes it's bugs that need fixing, sometimes it's new features + +🛠️ Pull requests - is how you can suggest changes to our repository, to work on an existing issue or to add new features + +📚 Wiki - where we have our documentation + + +## 🐞 Issues and Pull requests + +We value contributions to our issues in the form of discussion or suggestion; we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2) + +If you want to contribute by writing code there are a few things that you should know before doing it: +We have a frontend (React, Vite) and a backend (Python) + +### If you are looking to contribute to Frontend (⚛️React, Vite): +The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) +Please try to follow the guidelines + + +### If you are looking to contribute to Backend (🐍Python): +Check out our issues, and contribute to /application or /scripts (ignore the old ingest_rst.py and ingest_rst_sphinx.py files, they will be deprecated soon) +Currently we don't have any tests (which would be useful 😉), but before submitting your PR make sure that after you ingest some test data it is queryable + +### Workflow: +Create a fork, make changes on your forked repository, and submit changes in the form of a pull request + +## Questions / collaboration +Please join our [Discord](https://discord.gg/n5BX8dh8rU) — don't hesitate, we are very friendly and welcoming to new contributors.
+ +# Thank you so much for considering to contribute to DocsGPT!๐Ÿ™ diff --git a/README.md b/README.md index 9b81171e..ccd14305 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token ## [Guides](https://github.com/arc53/docsgpt/wiki) - +## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md) ## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation) diff --git a/application/app.py b/application/app.py index 41549568..e2bbf962 100644 --- a/application/app.py +++ b/application/app.py @@ -5,6 +5,7 @@ import datetime from flask import Flask, request, render_template # os.environ["LANGCHAIN_HANDLER"] = "langchain" import faiss + from langchain import FAISS from langchain import OpenAI, VectorDBQA, HuggingFaceHub, Cohere from langchain.chains.question_answering import load_qa_chain @@ -77,6 +78,7 @@ def api_answer(): c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template) # create a chain with the prompt template and the store + #llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048}) llm = OpenAI(openai_api_key=api_key, temperature=0) #llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key) diff --git a/application/requirements.txt b/application/requirements.txt index 9e8f73b1..878e1f24 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -45,6 +45,7 @@ pytz==2022.7.1 PyYAML==6.0 regex==2022.10.31 requests==2.28.2 +retry==0.9.2 six==1.16.0 snowballstemmer==2.2.0 Sphinx==6.1.3 @@ -60,6 +61,7 @@ tiktoken==0.1.2 tokenizers==0.13.2 tqdm==4.64.1 transformers==4.26.0 +typer==0.7.0 typing-inspect==0.8.0 typing_extensions==4.4.0 urllib3==1.26.14 diff --git a/scripts/ingest.py b/scripts/ingest.py index cebb6c33..ca4b8b6b 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,6 +1,11 @@ +from 
collections import defaultdict +import os import sys import nltk import dotenv +import typer + +from typing import List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -10,28 +15,69 @@ from parser.open_ai_func import call_openai_api, get_user_permission dotenv.load_dotenv() -#Specify your folder HERE -directory_to_ingest = 'inputs' +app = typer.Typer(add_completion=False) + +nltk.download('punkt', quiet=True) +nltk.download('averaged_perceptron_tagger', quiet=True) -nltk.download('punkt') -nltk.download('averaged_perceptron_tagger') #Splits all files in specified folder to documents -raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data() -raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] -# Here we split the documents, as needed, into smaller chunks. -# We do this due to the context limits of the LLMs. -text_splitter = RecursiveCharacterTextSplitter() -docs = text_splitter.split_documents(raw_docs) +@app.command() +def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, + help="Whether to skip price confirmation"), + dir: Optional[List[str]] = typer.Option(["inputs"], + help="""List of paths to directory for index creation. + E.g. --dir inputs --dir inputs2"""), + file: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides dir). + E.g. --file inputs/1.md --file inputs/2.md"""), + recursive: Optional[bool] = typer.Option(True, + help="Whether to recursively search in subdirectories."), + limit: Optional[int] = typer.Option(None, + help="Maximum number of files to read."), + formats: Optional[List[str]] = typer.Option([".rst", ".md"], + help="""List of required extensions (list with .) + Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""), + exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): -# Here we check for command line arguments for bot calls. 
-# If no argument exists or the permission_bypass_flag argument is not '-y', -# user permission is requested to call the API. -if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) - else: - get_user_permission(docs) -else: - get_user_permission(docs) \ No newline at end of file + """ + Creates index from specified location or files. + By default /inputs folder is used, .rst and .md are parsed. + """ + + def process_one_docs(directory, folder_name): + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. + text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the yes is not True, then the + # user permission is requested to call the API. 
+ if len(sys.argv) > 1: + if yes: + call_openai_api(docs, folder_name) + else: + get_user_permission(docs, folder_name) + else: + get_user_permission(docs, folder_name) + + folder_counts = defaultdict(int) + folder_names = [] + for dir_path in dir: + folder_name = os.path.basename(os.path.normpath(dir_path)) + folder_counts[folder_name] += 1 + if folder_counts[folder_name] > 1: + folder_name = f"{folder_name}_{folder_counts[folder_name]}" + folder_names.append(folder_name) + + for directory, folder_name in zip(dir, folder_names): + process_one_docs(directory, folder_name) + +if __name__ == "__main__": + app() diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py similarity index 100% rename from scripts/ingest_rst.py rename to scripts/old/ingest_rst.py diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py similarity index 88% rename from scripts/ingest_rst_sphinx.py rename to scripts/old/ingest_rst_sphinx.py index 9d6c8ece..ecc71570 100644 --- a/scripts/ingest_rst_sphinx.py +++ b/scripts/old/ingest_rst_sphinx.py @@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir): f"-D source_suffix=.rst " \ f"-C {dst_dir} " sphinx_main(args.split()) + elif file.endswith(".md"): + # Rename the .md file to .rst file + src_file = os.path.join(root, file) + dst_file = os.path.join(root, file.replace(".md", ".rst")) + os.rename(src_file, dst_file) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={dst_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) + def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. 
diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py index 0b887d4e..7c97b326 100644 --- a/scripts/parser/file/rst_parser.py +++ b/scripts/parser/file/rst_parser.py @@ -24,6 +24,8 @@ class RstParser(BaseParser): remove_hyperlinks: bool = True, remove_images: bool = True, remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, remove_whitespaces_excess: bool = True, #Be carefull with remove_characters_excess, might cause data loss remove_characters_excess: bool = True, @@ -34,6 +36,8 @@ class RstParser(BaseParser): self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images self._remove_table_excess = remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess @@ -95,6 +99,18 @@ class RstParser(BaseParser): content = re.sub(pattern, r"\1", content) return content + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + def remove_table_excess(self, content: str) -> str: """Pattern to remove grid table separators""" pattern = r"^\+[-]+\+[-]+\+$" @@ -129,6 +145,10 @@ class RstParser(BaseParser): content = self.remove_images(content) if self._remove_table_excess: content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) rst_tups = self.rst_to_tups(content) if self._remove_whitespaces_excess: rst_tups = self.remove_whitespaces_excess(rst_tups) diff 
--git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 2fe05170..a4d023c1 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -1,10 +1,15 @@ +import os import faiss import pickle import tiktoken from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings + #from langchain.embeddings import HuggingFaceEmbeddings +from retry import retry + + def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. @@ -13,8 +18,17 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: total_price = ((num_tokens/1000) * 0.0004) return num_tokens, total_price -def call_openai_api(docs): +@retry(tries=10, delay=60) +def store_add_texts_with_retry(store, i): + store.add_texts([i.page_content], metadatas=[i.metadata]) + +def call_openai_api(docs, folder_name): # Function to create a vector store from the documents and save it to disk. + + # create output folder if it doesn't exist + if not os.path.exists(f"outputs/{folder_name}"): + os.makedirs(f"outputs/{folder_name}") + from tqdm import tqdm docs_test = [docs[0]] # remove the first element from docs @@ -31,21 +45,29 @@ def call_openai_api(docs): for i in tqdm(docs, desc="Embedding ๐Ÿฆ–", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): try: import time - store.add_texts([i.page_content], metadatas=[i.metadata]) + store_add_texts_with_retry(store, i) except Exception as e: print(e) print("Error on ", i) print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") - store.save_local("outputs") - print("Sleeping for 10 seconds and trying again") - time.sleep(10) + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") + store_index_bak = store.index + store.index = None + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: + pickle.dump(store, f) + print("Sleeping for 60 seconds and trying again") + 
time.sleep(60) + store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 - store.save_local("outputs") + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") + store.index = None + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: + pickle.dump(store, f) -def get_user_permission(docs): +def get_user_permission(docs, folder_name): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. #docs_content = (" ".join(docs)) @@ -61,8 +83,8 @@ def get_user_permission(docs): #Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": - call_openai_api(docs) + call_openai_api(docs, folder_name) elif user_input == "": - call_openai_api(docs) + call_openai_api(docs, folder_name) else: - print("The API was not called. No money was spent.") \ No newline at end of file + print("The API was not called. No money was spent.")