From 205be538a33cc43fcf101a969743cd12ec190d99 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 12 Feb 2023 17:58:54 +0000 Subject: [PATCH 01/14] fix dbqa, with new chain type, also fix for doc export --- application/app.py | 19 +++++++++++++++---- scripts/parser/open_ai_func.py | 11 +++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/application/app.py b/application/app.py index aa9089ed..c114c63b 100644 --- a/application/app.py +++ b/application/app.py @@ -5,8 +5,8 @@ import datetime from flask import Flask, request, render_template # os.environ["LANGCHAIN_HANDLER"] = "langchain" import faiss -from langchain import OpenAI -from langchain.chains import VectorDBQAWithSourcesChain +from langchain import OpenAI, VectorDBQA +from langchain.chains.question_answering import load_qa_chain from langchain.prompts import PromptTemplate import requests @@ -69,11 +69,22 @@ def api_answer(): c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template) # create a chain with the prompt template and the store - chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + #chain = VectorDBQA.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + # chain = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=api_key, temperature=0), chain_type='map_reduce', + # vectorstore=store) + + qa_chain = load_qa_chain(OpenAI(openai_api_key=api_key, temperature=0), chain_type="map_reduce", + combine_prompt=c_prompt) + chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=store) + + + # fetch the answer - result = chain({"question": question}) + result = chain({"query": question}) + print(result) # some formatting for the frontend + result['answer'] = result['result'] result['answer'] = result['answer'].replace("\\n", "
") result['answer'] = result['answer'].replace("SOURCES:", "") # mock result diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 00c57be9..cbd947ee 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -31,13 +31,20 @@ def call_openai_api(docs): print("Error on ", i) print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") - store.save_local("outputs") + faiss.write_index(store.index, "docs.index") + store.index = None + with open("faiss_store.pkl", "wb") as f: + pickle.dump(store, f) print("Sleeping for 10 seconds and trying again") time.sleep(10) store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 - store.save_local("outputs") + + faiss.write_index(store.index, "docs.index") + store.index = None + with open("faiss_store.pkl", "wb") as f: + pickle.dump(store, f) def get_user_permission(docs): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. From b1a6ebffba593ae772344381b87ca173fe11e83a Mon Sep 17 00:00:00 2001 From: Pavel Date: Sun, 12 Feb 2023 22:29:40 +0400 Subject: [PATCH 02/14] Directives + Interpreted Some additional filters for rst parsing --- scripts/parser/file/rst_parser.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py index 0b887d4e..7c97b326 100644 --- a/scripts/parser/file/rst_parser.py +++ b/scripts/parser/file/rst_parser.py @@ -24,6 +24,8 @@ class RstParser(BaseParser): remove_hyperlinks: bool = True, remove_images: bool = True, remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, remove_whitespaces_excess: bool = True, #Be carefull with remove_characters_excess, might cause data loss remove_characters_excess: bool = True, @@ -34,6 +36,8 @@ class RstParser(BaseParser): self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images self._remove_table_excess = 
remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess @@ -95,6 +99,18 @@ class RstParser(BaseParser): content = re.sub(pattern, r"\1", content) return content + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + def remove_table_excess(self, content: str) -> str: """Pattern to remove grid table separators""" pattern = r"^\+[-]+\+[-]+\+$" @@ -129,6 +145,10 @@ class RstParser(BaseParser): content = self.remove_images(content) if self._remove_table_excess: content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) rst_tups = self.rst_to_tups(content) if self._remove_whitespaces_excess: rst_tups = self.remove_whitespaces_excess(rst_tups) From 3ab02ca9596111cc72bd440529b8150c5f811774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Mon, 13 Feb 2023 11:00:34 +0800 Subject: [PATCH 03/14] feat: compatible with markdown --- .gitignore | 1 + scripts/ingest_rst_sphinx.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index 8b394e9b..0003c21c 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ frontend/*.sw? 
application/vectors/ +**/inputs diff --git a/scripts/ingest_rst_sphinx.py b/scripts/ingest_rst_sphinx.py index 9d6c8ece..ecc71570 100644 --- a/scripts/ingest_rst_sphinx.py +++ b/scripts/ingest_rst_sphinx.py @@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir): f"-D source_suffix=.rst " \ f"-C {dst_dir} " sphinx_main(args.split()) + elif file.endswith(".md"): + # Rename the .md file to .rst file + src_file = os.path.join(root, file) + dst_file = os.path.join(root, file.replace(".md", ".rst")) + os.rename(src_file, dst_file) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={dst_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) + def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. From d5e5a5f59bbcd54b52fc582a4492e569f8b2a7ec Mon Sep 17 00:00:00 2001 From: Carson Yang Date: Mon, 13 Feb 2023 19:20:15 +0800 Subject: [PATCH 04/14] Add Docker Image CI support Signed-off-by: Carson Yang --- .github/workflows/ci.yml | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..325e0f64 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,46 @@ +name: Build and push DocsGPT Docker image + +on: + workflow_dispatch: + push: + branches: + - main + pull_request: + branches: [ main ] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to ghcr.io + uses: docker/login-action@v2 + with: 
+ registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} + + # Runs a single command using the runners shell + - name: Build and push Docker images to docker.io and ghcr.io + uses: docker/build-push-action@v2 + with: + file: './application/Dockerfile' + platforms: linux/amd64 + context: ./application + push: true + tags: | + ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest + ghcr.io/${{ github.repository_owner }}/docsgpt:latest From 458f2a3ff3a9edaa78290f2e1ed0f5237f787fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:05:16 +0800 Subject: [PATCH 05/14] fix: restore index back when continue process --- scripts/parser/open_ai_func.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index cbd947ee..433e5bcd 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -32,11 +32,14 @@ def call_openai_api(docs): print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") faiss.write_index(store.index, "docs.index") + store_index_bak = store.index store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) print("Sleeping for 10 seconds and trying again") time.sleep(10) + faiss.write_index(store_index_bak, "docs.index") + store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 @@ -66,4 +69,4 @@ def get_user_permission(docs): elif user_input == "": call_openai_api(docs) else: - print("The API was not called. No money was spent.") \ No newline at end of file + print("The API was not called. 
No money was spent.") From 636783ca8a5843b3dfb3e399e14ff3634efb4962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:29:17 +0800 Subject: [PATCH 06/14] fix: avoid second error issue --- scripts/parser/open_ai_func.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 433e5bcd..70091324 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -36,8 +36,8 @@ def call_openai_api(docs): store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) - print("Sleeping for 10 seconds and trying again") - time.sleep(10) + print("Sleeping for 60 seconds and trying again") + time.sleep(60) faiss.write_index(store_index_bak, "docs.index") store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) From 0b42279709a67101ce58bcf782b41fbc61c7278b Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:32:32 +0000 Subject: [PATCH 07/14] Update ci.yml --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 325e0f64..92cc718b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,8 +5,6 @@ on: push: branches: - main - pull_request: - branches: [ main ] jobs: deploy: From c67956da3794e527d6f997a525c959de298850fd Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:55:41 +0000 Subject: [PATCH 08/14] Create CONTRIBUTING.md --- CONTRIBUTING.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..0c10d0b4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +# Welcome to DocsGPT Contributing guideline + +Thank you for choosing this project to contribute to, we are all very grateful! 
+ +# We accept different types of contributions + +📣 Discussions - where you can start a new topic or answer some questions + +🐞 Issues - Is how we track tasks, sometimes its bugs that need fixing, sometimes its new features + +🛠️ Pull requests - Is how you can suggest changes to our repository, to work on existing issue or to add new features + +📚 Wiki - where we have our documentation + + +## 🐞 Issues and Pull requests + +We value contributions to our issues in form of discussion or suggestion, we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2) + +If you want to contribute by writing code there are few things that you should know before doing it: +We have frontend (React, Vite) and Backend (python) + +### If you are looking to contribute to Frontend (⚛️React, Vite): +Current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new on. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) +Please try to follow guidelines + + +### If you are looking to contribute to Backend (🐍Python): +Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon) +Currently we don't have any tests(which would be useful😉) but before submitting you PR make sure that after you ingested some test data its queryable + +### Workflow: +Create a fork, make changes on your forked repository, submit changes in a form of pull request + +## Questions / collaboration +Please join our [Discord](https://discord.gg/n5BX8dh8rU) don't hesitate, we are very friendly and welcoming to new contributors. 
+ +# Thank you so much for considering to contribute to DocsGPT!🙏 From 484764eac1695ada8cbf754ce1ba406cc9b2b815 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Feb 2023 14:56:25 +0000 Subject: [PATCH 09/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b81171e..ccd14305 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token ## [Guides](https://github.com/arc53/docsgpt/wiki) - +## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md) ## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation) From af20c7298aa04a6b64e70347884795abb20a5ad6 Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 14 Feb 2023 19:37:07 +0400 Subject: [PATCH 10/14] new-ingest Ingest with a CLI --- scripts/ingest.py | 65 ++++++++++++++++++-------- scripts/{ => old}/ingest_rst.py | 0 scripts/{ => old}/ingest_rst_sphinx.py | 0 3 files changed, 46 insertions(+), 19 deletions(-) rename scripts/{ => old}/ingest_rst.py (100%) rename scripts/{ => old}/ingest_rst_sphinx.py (100%) diff --git a/scripts/ingest.py b/scripts/ingest.py index cebb6c33..3082cf4e 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,6 +1,9 @@ import sys import nltk import dotenv +import typer + +from typing import List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -10,28 +13,52 @@ from parser.open_ai_func import call_openai_api, get_user_permission dotenv.load_dotenv() -#Specify your folder HERE -directory_to_ingest = 'inputs' +app = typer.Typer(add_completion=False) -nltk.download('punkt') -nltk.download('averaged_perceptron_tagger') +nltk.download('punkt', quiet=True) +nltk.download('averaged_perceptron_tagger', quiet=True) #Splits all files in specified folder to documents -raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data() 
-raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] -# Here we split the documents, as needed, into smaller chunks. -# We do this due to the context limits of the LLMs. -text_splitter = RecursiveCharacterTextSplitter() -docs = text_splitter.split_documents(raw_docs) +@app.command() +def ingest(directory: Optional[str] = typer.Option("inputs", + help="Path to the directory for index creation."), + files: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides directory). + E.g. --files inputs/1.md --files inputs/2.md"""), + recursive: Optional[bool] = typer.Option(True, + help="Whether to recursively search in subdirectories."), + limit: Optional[int] = typer.Option(None, + help="Maximum number of files to read."), + formats: Optional[List[str]] = typer.Option([".rst", ".md"], + help="""List of required extensions (list with .) + Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""), + exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): -# Here we check for command line arguments for bot calls. -# If no argument exists or the permission_bypass_flag argument is not '-y', -# user permission is requested to call the API. -if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) + """ + Creates index from specified location or files. + By default /inputs folder is used, .rst and .md are parsed. + """ + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. 
+ text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the permission_bypass_flag argument is not '-y', + # user permission is requested to call the API. + if len(sys.argv) > 1: + permission_bypass_flag = sys.argv[1] + if permission_bypass_flag == '-y': + call_openai_api(docs) + else: + get_user_permission(docs) else: get_user_permission(docs) -else: - get_user_permission(docs) \ No newline at end of file + +if __name__ == "__main__": + app() diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py similarity index 100% rename from scripts/ingest_rst.py rename to scripts/old/ingest_rst.py diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py similarity index 100% rename from scripts/ingest_rst_sphinx.py rename to scripts/old/ingest_rst_sphinx.py From 7af703451918234623c30d7bf62df5957397b49e Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 14 Feb 2023 19:41:37 +0400 Subject: [PATCH 11/14] requirements --- application/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/application/requirements.txt b/application/requirements.txt index 9e8f73b1..7972f8c3 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -60,6 +60,7 @@ tiktoken==0.1.2 tokenizers==0.13.2 tqdm==4.64.1 transformers==4.26.0 +typer==0.7.0 typing-inspect==0.8.0 typing_extensions==4.4.0 urllib3==1.26.14 From b83589a308a3b3dae3dd2342250bd5805a1e8e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=AF=E4=B8=8D=E6=B8=B8?= <71683364+mefengl@users.noreply.github.com> Date: Wed, 15 Feb 2023 02:30:39 +0800 Subject: [PATCH 12/14] feat: add support for directory list example: `python ingest.py --dir inputs1 --dir another --dir ../inputs`, the outputs will be in `outputs/input_folder_name/` --- scripts/ingest.py | 66 +++++++++++++++++++++------------- scripts/parser/open_ai_func.py | 24 +++++++------ 2 files 
changed, 56 insertions(+), 34 deletions(-) diff --git a/scripts/ingest.py b/scripts/ingest.py index 3082cf4e..6286a377 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,3 +1,5 @@ +from collections import defaultdict +import os import sys import nltk import dotenv @@ -18,13 +20,16 @@ app = typer.Typer(add_completion=False) nltk.download('punkt', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) + #Splits all files in specified folder to documents @app.command() -def ingest(directory: Optional[str] = typer.Option("inputs", - help="Path to the directory for index creation."), - files: Optional[List[str]] = typer.Option(None, - help="""File paths to use (Optional; overrides directory). - E.g. --files inputs/1.md --files inputs/2.md"""), +def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False), + dir: Optional[List[str]] = typer.Option(["inputs"], + help="""List of paths to directory for index creation. + E.g. --dir inputs --dir inputs2"""), + file: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides dir). + E.g. --file inputs/1.md --file inputs/2.md"""), recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."), limit: Optional[int] = typer.Option(None, @@ -38,27 +43,40 @@ def ingest(directory: Optional[str] = typer.Option("inputs", Creates index from specified location or files. By default /inputs folder is used, .rst and .md are parsed. """ - raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive, - required_exts=formats, num_files_limit=limit, - exclude_hidden=exclude).load_data() - raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - print(raw_docs) - # Here we split the documents, as needed, into smaller chunks. - # We do this due to the context limits of the LLMs. 
- text_splitter = RecursiveCharacterTextSplitter() - docs = text_splitter.split_documents(raw_docs) - # Here we check for command line arguments for bot calls. - # If no argument exists or the permission_bypass_flag argument is not '-y', - # user permission is requested to call the API. - if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) + def process_one_docs(directory, folder_name): + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. + text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the yes is not True, then the + # user permission is requested to call the API. 
+ if len(sys.argv) > 1: + if yes: + call_openai_api(docs, folder_name) + else: + get_user_permission(docs, folder_name) else: - get_user_permission(docs) - else: - get_user_permission(docs) + get_user_permission(docs, folder_name) + + folder_counts = defaultdict(int) + folder_names = [] + for dir_path in dir: + folder_name = os.path.basename(os.path.normpath(dir_path)) + folder_counts[folder_name] += 1 + if folder_counts[folder_name] > 1: + folder_name = f"{folder_name}_{folder_counts[folder_name]}" + folder_names.append(folder_name) + + for directory, folder_name in zip(dir, folder_names): + process_one_docs(directory, folder_name) if __name__ == "__main__": app() diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 70091324..c396600c 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -1,3 +1,4 @@ +import os import faiss import pickle import tiktoken @@ -12,8 +13,13 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: total_price = ((num_tokens/1000) * 0.0004) return num_tokens, total_price -def call_openai_api(docs): +def call_openai_api(docs, folder_name): # Function to create a vector store from the documents and save it to disk. 
+ + # create output folder if it doesn't exist + if not os.path.exists(f"outputs/{folder_name}"): + os.makedirs(f"outputs/{folder_name}") + from tqdm import tqdm docs_test = [docs[0]] # remove the first element from docs @@ -31,25 +37,23 @@ def call_openai_api(docs): print("Error on ", i) print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") - faiss.write_index(store.index, "docs.index") + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") store_index_bak = store.index store.index = None - with open("faiss_store.pkl", "wb") as f: + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: pickle.dump(store, f) print("Sleeping for 60 seconds and trying again") time.sleep(60) - faiss.write_index(store_index_bak, "docs.index") store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 - - faiss.write_index(store.index, "docs.index") + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") store.index = None - with open("faiss_store.pkl", "wb") as f: + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: pickle.dump(store, f) -def get_user_permission(docs): +def get_user_permission(docs, folder_name): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. #docs_content = (" ".join(docs)) @@ -65,8 +69,8 @@ def get_user_permission(docs): #Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": - call_openai_api(docs) + call_openai_api(docs, folder_name) elif user_input == "": - call_openai_api(docs) + call_openai_api(docs, folder_name) else: print("The API was not called. 
No money was spent.") From d57c7b02961747630f8ac86b2e0cb01e19276cbc Mon Sep 17 00:00:00 2001 From: Pavel Date: Wed, 15 Feb 2023 13:10:30 +0400 Subject: [PATCH 13/14] -y-description --- scripts/ingest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/ingest.py b/scripts/ingest.py index 6286a377..ca4b8b6b 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -23,7 +23,8 @@ nltk.download('averaged_perceptron_tagger', quiet=True) #Splits all files in specified folder to documents @app.command() -def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False), +def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, + help="Whether to skip price confirmation"), dir: Optional[List[str]] = typer.Option(["inputs"], help="""List of paths to directory for index creation. E.g. --dir inputs --dir inputs2"""), From aeac1864847ad75c07f804c0792387a90dd379ee Mon Sep 17 00:00:00 2001 From: EricGao888 Date: Wed, 15 Feb 2023 11:04:31 +0800 Subject: [PATCH 14/14] Add retry strategy to increase stability --- application/requirements.txt | 1 + scripts/parser/open_ai_func.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 7972f8c3..878e1f24 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -45,6 +45,7 @@ pytz==2022.7.1 PyYAML==6.0 regex==2022.10.31 requests==2.28.2 +retry==0.9.2 six==1.16.0 snowballstemmer==2.2.0 Sphinx==6.1.3 diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index c396600c..472fdc4b 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -4,6 +4,7 @@ import pickle import tiktoken from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings +from retry import retry def num_tokens_from_string(string: str, encoding_name: str) -> int: @@ -13,6 +14,10 @@ def num_tokens_from_string(string: str, encoding_name: str) -> 
int: total_price = ((num_tokens/1000) * 0.0004) return num_tokens, total_price +@retry(tries=10, delay=60) +def store_add_texts_with_retry(store, i): + store.add_texts([i.page_content], metadatas=[i.metadata]) + def call_openai_api(docs, folder_name): # Function to create a vector store from the documents and save it to disk. @@ -31,7 +36,7 @@ def call_openai_api(docs, folder_name): for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): try: import time - store.add_texts([i.page_content], metadatas=[i.metadata]) + store_add_texts_with_retry(store, i) except Exception as e: print(e) print("Error on ", i)