mirror of
https://github.com/coleam00/ai-agents-masterclass.git
synced 2025-11-29 16:43:14 +00:00
120 lines
4.4 KiB
Python
120 lines
4.4 KiB
Python
import hashlib
|
|
import re
|
|
|
|
from langchain_core.tools import tool
|
|
from langchain_community.document_loaders import TextLoader
|
|
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
|
from langchain_community.document_loaders import DirectoryLoader
|
|
from langchain_text_splitters import CharacterTextSplitter
|
|
from langchain_chroma import Chroma
|
|
|
|
def get_chroma_instance(persist_directory: str = "./chroma_db", model_name: str = "all-MiniLM-L6-v2"):
    """
    Builds the Chroma vector store used for RAG, backed by a directory on disk.

    Arguments:
    - persist_directory (str, optional): Directory handed to Chroma as its persist location.
      Defaults to "./chroma_db".
    - model_name (str, optional): Name of the sentence-transformer model used to embed text.
      Defaults to "all-MiniLM-L6-v2".

    Returns:
    - Chroma: A Chroma instance wired to the given directory and embedding function.

    Example usage:
    get_chroma_instance()
    get_chroma_instance(persist_directory="./other_db")
    """
    # Create the open-source embedding function
    # NOTE(review): presumably an existing persisted store must be reopened with the same
    # embedding model it was built with - confirm before passing a different model_name.
    embedding_function = SentenceTransformerEmbeddings(model_name=model_name)

    # Get the Chroma instance from what is saved to the disk
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
|
|
|
|
# Module-level vector store handle, created once when this module is imported
# and shared by the tool functions defined below.
db = get_chroma_instance()
|
|
|
|
def string_to_vector_id(input_string: str, max_length: int = 64) -> str:
    """
    Converts a string into a vector-friendly ID by removing special characters,
    replacing whitespace with underscores, and hashing the string if it exceeds max length.

    Arguments:
    - input_string (str): The input string to convert to a vector ID.
    - max_length (int, optional): The maximum length of the vector ID. Defaults to 64 characters.

    Returns:
    - str: A string containing only ASCII letters, digits and underscores. If the sanitized
      string is longer than max_length, the first max_length characters of its SHA-256 hex
      digest are returned instead (a hex digest is 64 characters, so at most 64 are returned).

    Example usage:
    string_to_vector_id("Example String For Vector ID")  # -> "Example_String_For_Vector_ID"
    """
    # Remove everything except ASCII letters, digits, whitespace and underscores
    sanitized_string = re.sub(r'[^a-zA-Z0-9\s_]', '', input_string)

    # Replace every whitespace character with an underscore. The pattern above keeps all
    # whitespace (tabs, newlines, ...), so replacing only " " would let the rest leak into the ID.
    sanitized_string = re.sub(r'\s', '_', sanitized_string)

    # If the string is too long, hash it so the ID stays within the max length
    if len(sanitized_string) > max_length:
        hash_object = hashlib.sha256(sanitized_string.encode())
        sanitized_string = hash_object.hexdigest()[:max_length]

    return sanitized_string
|
|
|
|
@tool
def query_documents(question: str) -> str:
    """
    Uses RAG to query documents for information to answer a question
    that requires specific context that could be found in documents

    Example call:

    query_documents("What are the action items from the meeting on the 20th?")
    Args:
        question (str): The question the user asked that might be answerable from the searchable documents
    Returns:
        str: The list of texts (and their sources) that matched with the question the closest using RAG
    """
    # NOTE(review): the docstring wording is left untouched on purpose - presumably the
    # @tool decorator exposes it as the tool description, so it is runtime text.
    try:
        # Fetch the 3 closest matches, then render each one as "Source/Content" text.
        # Formatting stays inside the try so any failure is reported the same way.
        rendered_hits = [
            f"Source: {hit.metadata.get('source', 'NA')}\nContent: {hit.page_content}"
            for hit in db.similarity_search(question, k=3)
        ]

        # The caller receives the string form of the Python list
        return str(rendered_hits)
    except Exception as err:
        # Errors are returned as text rather than raised
        return f"Error querying the vector DB: {err}"
|
|
|
|
@tool
def add_doc_to_knowledgebase(file_path: str) -> str:
    """
    Adds a local document to the vector DB knowledgbase for RAG.
    This function can only be called on local documents - Google Drive docs must be downloaded first.
    The content of the file is put in the vector DB with the metadata
    including the file source. ID is randomly generated.

    Example call:

    add_doc_to_knowledgebase("/path/to/local/file")
    Args:
        file_path (str): The local path to the file to add to the knowledgebase (NOT Google Drive)
    Returns:
        str: The success of the operation of adding the document to the vector DB
    """
    # NOTE(review): the docstring wording is left untouched on purpose - presumably the
    # @tool decorator exposes it as the tool description, so it is runtime text. It says
    # the ID is random, but the code below derives the ID from the file name.
    try:
        # Load the file first so a bad path is reported before any ID is computed
        loaded_docs = TextLoader(file_path).load()

        # The ID is the sanitized last "/"-separated segment of the path
        file_name = file_path.split("/")[-1]
        doc_id = string_to_vector_id(file_name)

        db.add_documents(documents=loaded_docs, ids=[doc_id])
        return "Successfully added the file to the knowledgebase."
    except Exception as err:
        # Errors are returned as text rather than raised
        return f"Error adding file to knowledgbase: {err}"
|
|
|
|
@tool
def clear_knowledgebase() -> str:
    """
    Removes all documents from the vector DB knowledgebase to clear it.

    Example call:

    clear_knowledgebase()
    Returns:
        str: The success of the operation of clearing the vector DB
    """
    # NOTE(review): the docstring wording is left untouched on purpose - presumably the
    # @tool decorator exposes it as the tool description, so it is runtime text.
    try:
        # Reset the collection held by the module-level Chroma handle
        db.reset_collection()
    except Exception as err:
        # Errors are returned as text rather than raised
        return f"Error clearing the knowledgbase: {err}"
    else:
        return "Successfully cleared the knowledgebase."
|
|
|
|
|
|
# Registry of the vector DB tools in this script, keyed by function name.
# Per the original note, this same mapping is also used to build the list of
# tools that gets bound to the agent.
available_vector_db_functions = dict(
    query_documents=query_documents,
    add_doc_to_knowledgebase=add_doc_to_knowledgebase,
    clear_knowledgebase=clear_knowledgebase,
)
|