"""Smoke-test a previously built FAISS index (scripts/test_ingestion.py).

Loads the vector store saved under ``outputs/inputs``, runs one fixed
similarity query against it, prints the matching documents, and then prints
the token count of each match using the ``cl100k_base`` tiktoken encoding.

The embeddings key is read from the ``API_KEY`` environment variable, which
may be supplied through a ``.env`` file (loaded with ``dotenv``).
"""
import os

import dotenv
import tiktoken
from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Pull variables from a .env file into the process environment before
# reading the key.
dotenv.load_dotenv()
embeddings_key = os.getenv("API_KEY")

# The query must be embedded with the same embedding class that is passed
# here when loading the stored index.
docsearch = FAISS.load_local(
    'outputs/inputs',
    OpenAIEmbeddings(openai_api_key=embeddings_key),
)

d1 = docsearch.similarity_search("Whats new in 1.5.3?")
print(d1)
print("=====================================")
print("=====================================")

# The encoding does not depend on the document, so look it up once instead
# of once per loop iteration.
encoding = tiktoken.get_encoding("cl100k_base")
for doc in d1:
    print("docs length (tokens)")
    doc_len = len(encoding.encode(doc.page_content))
    print(doc_len)