Files
ai-agents-masterclass/6-rag-task-agent/rag-document-loader.py
2024-08-02 14:47:55 -05:00

35 lines
1.1 KiB
Python

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from dotenv import load_dotenv
import os
# Read key/value pairs from a .env file into the process environment
# (python-dotenv) before any setting is looked up.
load_dotenv()

# Folder containing the documents to index. The DIRECTORY environment
# variable overrides the built-in default.
rag_directory = os.environ.get('DIRECTORY', 'meeting_notes')
def load_documents(directory, *, chunk_size=1000, chunk_overlap=0):
    """Load the files under *directory* and split them into text chunks.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.
        chunk_size: Chunk size passed to ``CharacterTextSplitter``.
            Defaults to 1000, the value this script previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``CharacterTextSplitter``. Defaults to 0 (no overlap).

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    # DirectoryLoader decides which files to read and how to parse them.
    # NOTE(review): the original comment said "PDF or txt documents" —
    # confirm which formats the installed loaders actually support.
    documents = DirectoryLoader(directory).load()

    # Split the loaded documents into chunks sized for embedding.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def main():
    """Build the on-disk Chroma vector store from ``rag_directory``.

    Loads and chunks the documents, embeds them with the
    ``all-MiniLM-L6-v2`` sentence-transformer model, and persists the
    result under ``./chroma_db``. When no chunks are produced, nothing is
    embedded or written and a short notice is printed instead.
    """
    # Get the documents split into chunks
    docs = load_documents(rag_directory)

    # No chunks means there is nothing to index: skip loading the embedding
    # model and avoid handing Chroma an empty batch.
    if not docs:
        print(f"No documents found in '{rag_directory}'; nothing to index.")
        return

    # Create the open-source embedding function
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Load the documents into Chroma and save it to the disk
    Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
# Run the indexing job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()