mirror of
https://github.com/coleam00/ai-agents-masterclass.git
synced 2025-11-29 08:33:16 +00:00
LLM Eval Framework + Breaking LLMs
This commit is contained in:
121
llm-agent-evaluation-framework/tools/vector_db_tools.py
Normal file
121
llm-agent-evaluation-framework/tools/vector_db_tools.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import streamlit as st
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
from langchain_core.tools import tool
|
||||
from langchain_community.document_loaders import TextLoader
|
||||
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
||||
from langchain_community.document_loaders import DirectoryLoader
|
||||
from langchain_text_splitters import CharacterTextSplitter
|
||||
from langchain_chroma import Chroma
|
||||
|
||||
@st.cache_resource
def get_chroma_instance():
    """
    Build the Chroma vector store used by the tools in this module.

    Decorated with ``st.cache_resource`` so Streamlit can hand back the same
    instance on later calls instead of constructing it again.

    Returns:
        Chroma: A store that persists to the "./chroma_db" directory and embeds
        text with the "all-MiniLM-L6-v2" sentence-transformer model.
    """
    # Open-source sentence-transformer model that turns text into vectors
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Point Chroma at the on-disk directory so previously saved data is used
    vector_store = Chroma(
        persist_directory="./chroma_db",
        embedding_function=embedder,
    )
    return vector_store
|
||||
|
||||
# Module-level vector store handle used by the tool functions defined below
db = get_chroma_instance()
|
||||
|
||||
def string_to_vector_id(input_string: str, max_length: int = 64) -> str:
    """
    Converts a string into a vector-friendly ID by removing special characters,
    replacing whitespace with underscores, and hashing the string if it exceeds max length.

    Only ASCII letters, digits and underscores survive sanitization. Every
    whitespace character (space, tab, newline, ...) becomes one underscore.
    If nothing survives (for example the input is empty or consists solely of
    punctuation), the ID falls back to a SHA-256 hex digest of the original
    input so that distinct inputs do not all collapse to an empty ID.

    Arguments:
    - input_string (str): The input string to convert to a vector ID.
    - max_length (int, optional): The maximum length of the vector ID. Defaults to 64 characters.

    Returns:
    - str: A string that can be used as a vector ID.

    Example usage:
    string_to_vector_id("Example String For Vector ID")
    """
    # Remove everything except ASCII alphanumerics, whitespace and underscores
    sanitized_string = re.sub(r'[^a-zA-Z0-9\s_]', '', input_string)

    # Replace each whitespace character (not just " ") with an underscore so
    # that tabs and newlines cannot leak into the ID
    sanitized_string = re.sub(r'\s', '_', sanitized_string)

    if not sanitized_string:
        # Nothing usable survived sanitization: derive the ID from the raw
        # input instead of returning an empty string
        raw_bytes = input_string.encode("utf-8", errors="backslashreplace")
        return hashlib.sha256(raw_bytes).hexdigest()[:max_length]

    if len(sanitized_string) > max_length:
        # Too long: swap in a hash of the sanitized text, cut to max_length
        # (a SHA-256 hex digest is 64 characters, so that is the upper bound)
        hash_object = hashlib.sha256(sanitized_string.encode())
        sanitized_string = hash_object.hexdigest()[:max_length]

    return sanitized_string
|
||||
|
||||
@tool
def query_documents(question: str) -> str:
    """
    Uses RAG to query documents for information to answer a question
    that requires specific context that could be found in documents

    Example call:

    query_documents("What are the action items from the meeting on the 20th?")
    Args:
        question (str): The question the user asked that might be answerable from the searchable documents
    Returns:
        str: The list of texts (and their sources) that matched with the question the closest using RAG
    """
    # NOTE(review): the docstring wording is left untouched because the @tool
    # decorator presumably exposes it as the tool description - confirm before
    # rewording it.
    try:
        # Fetch the three stored chunks closest to the question
        matches = db.similarity_search(question, k=3)

        # One "Source/Content" entry per match; 'NA' when no source is recorded
        formatted_matches = [
            f"Source: {match.metadata.get('source', 'NA')}\nContent: {match.page_content}"
            for match in matches
        ]
    except Exception as e:
        # Report the failure as text rather than raising to the caller
        return f"Error querying the vector DB: {e}"
    else:
        # The result is the string form of the whole list
        return str(formatted_matches)
|
||||
|
||||
@tool
def add_doc_to_knowledgebase(file_path: str) -> str:
    """
    Adds a local document to the vector DB knowledgebase for RAG.
    This function can only be called on local documents - Google Drive docs must be downloaded first.
    The content of the file is put in the vector DB with the metadata
    including the file source. The ID is derived from the file name
    (the part of the path after the last "/"), so files that share a
    name share an ID.

    Example call:

    add_doc_to_knowledgebase("/path/to/local/file")
    Args:
        file_path (str): The local path to the file to add to the knowledgebase (NOT Google Drive)
    Returns:
        str: The success of the operation of adding the document to the vector DB
    """
    try:
        # Read the file into a list of documents
        documents = TextLoader(file_path).load()

        # The ID comes from the last "/"-separated path component (the file
        # name), sanitized into a vector-friendly form - it is not random
        file_name = file_path.split("/")[-1]
        doc_id = string_to_vector_id(file_name)

        db.add_documents(documents=documents, ids=[doc_id])
        return "Successfully added the file to the knowledgebase."
    except Exception as e:
        # Report the failure as text rather than raising to the caller
        return f"Error adding file to knowledgbase: {e}"
|
||||
|
||||
@tool
def clear_knowledgebase() -> str:
    """
    Removes all documents from the vector DB knowledgebase to clear it.

    Example call:

    clear_knowledgebase()
    Returns:
        str: The success of the operation of clearing the vector DB
    """
    # NOTE(review): the docstring wording is left untouched because the @tool
    # decorator presumably exposes it as the tool description - confirm before
    # rewording it.
    try:
        db.reset_collection()
    except Exception as e:
        # Report the failure as text rather than raising to the caller
        return f"Error clearing the knowledgbase: {e}"
    else:
        return "Successfully cleared the knowledgebase."
|
||||
|
||||
|
||||
# Lookup table from each tool's name to the tool object defined in this module.
# The same mapping is also used to build the list of tools bound to the agent.
available_vector_db_functions = {
    "query_documents": query_documents,
    "add_doc_to_knowledgebase": add_doc_to_knowledgebase,
    "clear_knowledgebase": clear_knowledgebase,
}
|
||||
Reference in New Issue
Block a user