Mirror of https://github.com/arc53/DocsGPT.git (synced 2025-11-29 08:33:20 +00:00)
Commit: init2
scripts/ingest_rst.py (new file, 40 lines)
@@ -0,0 +1,40 @@
from pathlib import Path

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import faiss
import pickle
import dotenv

# Load environment variables (e.g. the OpenAI API key) from a .env file.
dotenv.load_dotenv()

# Collect every .rst file under pandasdocs/, recursing into all child
# directories.
ps = list(Path("pandasdocs/").glob("**/*.rst"))

data = []
sources = []
for p in ps:
    with open(p) as f:
        data.append(f.read())
    sources.append(p)

# Split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs = []
metadatas = []
for i, d in enumerate(data):
    splits = text_splitter.split_text(d)
    docs.extend(splits)
    metadatas.extend([{"source": str(sources[i])}] * len(splits))

# Create a vector store from the chunks and save it to disk. The raw FAISS
# index is written separately because it cannot be pickled with the rest
# of the store.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
    pickle.dump(store, f)
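To query the ingested docs later, the pickled store has to be reloaded and the raw FAISS index reattached, mirroring how the script split them apart. A minimal sketch, assuming the same pre-0.1 langchain API the script targets and that docs.index and faiss_store.pkl sit in the working directory (the query string is only an illustration):

import pickle
import faiss

# Reload the pickled store and reattach the FAISS index that was stripped
# out before pickling.
with open("faiss_store.pkl", "rb") as f:
    store = pickle.load(f)
store.index = faiss.read_index("docs.index")

# Each hit carries the {"source": ...} metadata recorded at ingest time.
for doc in store.similarity_search("How do I create a DataFrame?", k=2):
    print(doc.metadata["source"])

Note that the store was pickled together with its OpenAIEmbeddings object, so OPENAI_API_KEY still has to be set at query time to embed the search string.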