Bulk ingest

Added a method based on indexGPT folder ingester. Additional rst reader included.
2025-11-30 00:53:14 +00:00 · 2023-02-10 19:44:42 +04:00
parent 5038de06bb
commit 79b5ef9c14
13 changed files with 962 additions and 0 deletions
--- a/scripts/ingest_bulk/ingest.py
+++ b/scripts/ingest_bulk/ingest.py
@@ -0,0 +1,37 @@
+import sys
+import nltk
+import dotenv
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from parser.file.bulk import SimpleDirectoryReader
+from parser.schema.base import Document
+from parser.open_ai_func import call_openai_api, get_user_permission
+
+dotenv.load_dotenv()
+
+#Specify your folder HERE
+directory_to_ingest = 'data_test'
+
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+
+#Splits all files in specified folder to documents
+raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
+raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+# Here we split the documents, as needed, into smaller chunks.
+# We do this due to the context limits of the LLMs.
+text_splitter = RecursiveCharacterTextSplitter()
+docs = text_splitter.split_documents(raw_docs)
+
+# Here we check for command line arguments for bot calls.
+# If no argument exists or the permission_bypass_flag argument is not '-y',
+# user permission is requested to call the API.
+if len(sys.argv) > 1:
+    permission_bypass_flag = sys.argv[1]
+    if permission_bypass_flag == '-y':
+        call_openai_api(docs)
+    else:
+        get_user_permission(docs)
+else:
+    get_user_permission(docs)