# Mirror of https://github.com/coleam00/ai-agents-masterclass.git
# Synced 2025-11-29 08:33:16 +00:00
# 35 lines, 1.1 KiB, Python
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
|
from langchain_community.document_loaders import DirectoryLoader
|
|
from langchain_text_splitters import CharacterTextSplitter
|
|
from langchain_chroma import Chroma
|
|
from dotenv import load_dotenv
|
|
import os
|
|
|
|
# Read a local .env file (when present) before any settings are looked up.
load_dotenv()

# Folder whose documents get indexed; the DIRECTORY environment variable
# overrides the 'meeting_notes' fallback.
rag_directory = os.environ.get("DIRECTORY", "meeting_notes")
|
|
|
|
def load_documents(directory, *, chunk_size=1000, chunk_overlap=0):
    """Load the documents under *directory* and split them into chunks.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.
        chunk_size: Value forwarded to ``CharacterTextSplitter`` as its
            ``chunk_size``. Defaults to 1000, the value that used to be
            hard-coded here.
        chunk_overlap: Value forwarded to ``CharacterTextSplitter`` as its
            ``chunk_overlap``. Defaults to 0, the value that used to be
            hard-coded here.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` produces for the
        loaded documents (the chunked documents).
    """
    # Load the PDF or txt documents from the directory
    loader = DirectoryLoader(directory)
    documents = loader.load()

    # Split the documents into chunks; the sizes are caller-tunable so other
    # corpora can be indexed without editing this function.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
|
|
|
|
def main():
    """Chunk the documents in ``rag_directory`` and store them in Chroma.

    The chunks come from ``load_documents``; they are embedded with the
    "all-MiniLM-L6-v2" sentence-transformer model and handed to
    ``Chroma.from_documents`` with ``./chroma_db`` as the persist directory.
    """
    # Chunked documents from the configured folder.
    chunks = load_documents(rag_directory)

    # Open-source embedding model used to vectorize each chunk.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Build the vector store from the chunks and persist it on disk.
    Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory="./chroma_db",
    )
|
|
|
|
|
|
# Run the indexing job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()