diff --git a/application/requirements.txt b/application/requirements.txt index b26c87b4..e46c91fc 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -1,11 +1,15 @@ aiohttp==3.8.3 aiosignal==1.3.1 +alabaster==0.7.13 async-timeout==4.0.2 attrs==22.2.0 +Babel==2.11.0 blobfile==2.0.1 +certifi==2022.12.7 charset-normalizer==2.1.1 click==8.1.3 dataclasses-json==0.5.7 +docutils==0.19 faiss-cpu==1.7.3 filelock==3.9.0 Flask==2.2.2 @@ -13,6 +17,7 @@ frozenlist==1.3.3 greenlet==2.0.2 huggingface-hub==0.12.0 idna==3.4 +imagesize==1.4.1 itsdangerous==2.1.2 Jinja2==3.1.2 langchain==0.0.76 @@ -27,10 +32,20 @@ openai==0.26.4 packaging==23.0 pycryptodomex==3.17 pydantic==1.10.4 +Pygments==2.14.0 python-dotenv==0.21.1 +pytz==2022.7.1 PyYAML==6.0 regex==2022.10.31 requests==2.28.2 +snowballstemmer==2.2.0 +Sphinx==6.1.3 +sphinxcontrib-applehelp==1.0.4 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.5 SQLAlchemy==1.4.46 tiktoken==0.1.2 tokenizers==0.13.2 diff --git a/scripts/ingest_rst.py b/scripts/ingest_rst.py index 03d090e8..38881daf 100644 --- a/scripts/ingest_rst.py +++ b/scripts/ingest_rst.py @@ -5,13 +5,12 @@ from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings import pickle import dotenv -import os dotenv.load_dotenv() # Here we load in the data in the format that Notion exports it in. 
-ps = list(Path("pandasdocs/").glob("**/*.rst"))
+ps = list(Path("scikit-learn").glob("**/*.rst"))
 
 # parse all child directories
 data = []
@@ -37,4 +36,4 @@ store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
 faiss.write_index(store.index, "docs.index")
 store.index = None
 with open("faiss_store.pkl", "wb") as f:
-    pickle.dump(store, f)
+    pickle.dump(store, f)
\ No newline at end of file
diff --git a/scripts/ingest_rst_sphinx.py b/scripts/ingest_rst_sphinx.py
new file mode 100644
index 00000000..183ccac9
--- /dev/null
+++ b/scripts/ingest_rst_sphinx.py
@@ -0,0 +1,93 @@
+import os
+import pickle
+import dotenv
+import faiss
+import shutil
+from pathlib import Path
+from langchain.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from sphinx.cmd.build import main as sphinx_main
+
+
+def convert_rst_to_txt(src_dir, dst_dir):
+    """Render every ``.rst`` file under *src_dir* to plain text with Sphinx.
+
+    For each ``.rst`` file found by walking *src_dir*, the Sphinx ``text``
+    builder is invoked with the current directory as the source root, that
+    file (without its extension) as ``master_doc``, and *dst_dir* as the
+    output directory.  The return code of each build is not inspected.
+
+    Args:
+        src_dir: Directory that is searched recursively for ``.rst`` files.
+        dst_dir: Directory handed to Sphinx as the build output location.
+
+    Raises:
+        FileNotFoundError: If *src_dir* is not an existing directory.
+    """
+    if not os.path.isdir(src_dir):
+        raise FileNotFoundError(f"Source directory does not exist: {src_dir}")
+    for root, _dirs, files in os.walk(src_dir):
+        for file in files:
+            if not file.endswith(".rst"):
+                continue
+            # Drop only the trailing extension; str.replace() would also eat
+            # a ".rst" that appears earlier in the file name.
+            stem = file[: -len(".rst")]
+            # Sphinx document names use "/" as separator on every platform.
+            src_file = os.path.join(root, stem).replace(os.sep, "/")
+            # Pass an argument list instead of splitting a formatted string,
+            # so paths that contain whitespace stay in one piece.
+            args = [
+                ".",
+                "-b", "text",
+                "-D", "extensions=sphinx.ext.autodoc",
+                "-D", f"master_doc={src_file}",
+                "-D", "source_suffix=.rst",
+                "-C",
+                dst_dir,
+            ]
+            sphinx_main(args)
+
+
+# Load .env file
+dotenv.load_dotenv()
+
+# Directory to vector
+src_dir = "scikit-learn"
+dst_dir = "tmp"
+
+convert_rst_to_txt(src_dir, dst_dir)
+
+# Collect the plain-text files produced by the Sphinx text builder.
+ps = list(Path(dst_dir, src_dir).glob("**/*.txt"))
+
+# Read every generated text file and remember which file it came from.
+data = []
+sources = []
+for p in ps:
+    with open(p, encoding="utf-8") as f:
+        data.append(f.read())
+    sources.append(p)
+
+# Here we split the documents, as needed, into smaller chunks.
+# We do this due to the context limits of the LLMs.
+text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
+docs = []
+metadatas = []
+for source, d in zip(sources, data):
+    splits = text_splitter.split_text(d)
+    docs.extend(splits)
+    metadatas.extend({"source": source} for _ in splits)
+
+
+# Here we create a vector store from the documents and save it to disk.
+store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
+faiss.write_index(store.index, "docs.index")
+store.index = None
+with open("faiss_store.pkl", "wb") as f:
+    pickle.dump(store, f)
+
+# Delete tmp folder
+# Commented out for now
+# shutil.rmtree(dst_dir)