# ai-agents-masterclass/llm-agent-evaluation-framework/tools/vector_db_tools.py

import streamlit as st
import hashlib
import re

from langchain_core.tools import tool
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

@st.cache_resource
def get_chroma_instance():
    """
    Build the Chroma vector store handle used by this module.

    The store is opened on the "./chroma_db" persist directory and embeds text
    with the open-source "all-MiniLM-L6-v2" sentence-transformer model.
    The function is wrapped in st.cache_resource, so Streamlit is expected to
    hand back the same instance on later calls instead of rebuilding it.

    Returns:
        The Chroma instance bound to the on-disk collection.
    """
    return Chroma(
        persist_directory="./chroma_db",
        # Open-source embedding function used for both indexing and querying
        embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
    )

# Module-level vector store handle used by the tool functions below.
# It is created when this module is imported.
db = get_chroma_instance()

def string_to_vector_id(input_string: str, max_length: int = 64) -> str:
    """
    Converts a string into a vector-friendly ID.

    Characters other than ASCII letters, digits, underscores and whitespace are
    removed, every whitespace character (spaces, tabs, newlines, ...) is replaced
    with an underscore, and a result longer than max_length is replaced by a
    truncated SHA-256 hex digest. If nothing survives sanitizing (empty input or
    input made only of special characters), a digest of the original input is
    returned instead so that, for a positive max_length, the ID is never empty.

    Arguments:
    - input_string (str): The input string to convert to a vector ID.
    - max_length (int, optional): The maximum length of the vector ID. Defaults to 64 characters.
      A SHA-256 hex digest has 64 characters, so hashed IDs are never longer than that.

    Returns:
    - str: A string that can be used as a vector ID.

    Example usage:
    string_to_vector_id("Example String For Vector ID")  # -> "Example_String_For_Vector_ID"
    """
    # Remove non-alphanumeric characters (except whitespace and underscores)
    sanitized_string = re.sub(r'[^a-zA-Z0-9\s_]', '', input_string)

    # Replace every whitespace character with an underscore. The pattern above
    # keeps tabs and newlines too, so replacing only " " would leave them in the ID.
    sanitized_string = re.sub(r'\s', '_', sanitized_string)

    if not sanitized_string:
        # Nothing usable is left; derive the ID from the raw input so that
        # different inputs do not all collapse to the same empty ID.
        return hashlib.sha256(input_string.encode()).hexdigest()[:max_length]

    if len(sanitized_string) > max_length:
        # Too long: hash it so the ID fits within the max length
        return hashlib.sha256(sanitized_string.encode()).hexdigest()[:max_length]

    return sanitized_string

@tool
def query_documents(question: str) -> str:
    """
    Uses RAG to query documents for information to answer a question
    that requires specific context that could be found in documents

    Example call:

    query_documents("What are the action items from the meeting on the 20th?")
    Args:
        question (str): The question the user asked that might be answerable from the searchable documents
    Returns:
        str: The list of texts (and their sources) that matched with the question the closest using RAG
    """
    # NOTE: the docstring above is presumably what @tool exposes as the tool
    # description, so it is treated as runtime text and left untouched.
    try:
        # Fetch the three closest matches from the vector store
        matches = db.similarity_search(question, k=3)

        # One "Source/Content" entry per match; 'NA' when no source metadata exists
        formatted = [
            f"Source: {doc.metadata.get('source', 'NA')}\nContent: {doc.page_content}"
            for doc in matches
        ]

        # The result is the string form of the Python list, not joined text
        return str(formatted)
    except Exception as e:
        # Failures are reported back as text rather than raised
        return f"Error querying the vector DB: {e}"

@tool
def add_doc_to_knowledgebase(file_path: str) -> str:
    """
    Adds a local document to the vector DB knowledgebase for RAG.
    This function can only be called on local documents - Google Drive docs must be downloaded first.
    The content of the file is put in the vector DB with the metadata
    including the file source. The ID is derived from the file name, so
    adding a file with the same name again reuses the same ID.

    Example call:

    add_doc_to_knowledgebase("/path/to/local/file")
    Args:
        file_path (str): The local path to the file to add to the knowledgebase (NOT Google Drive)
    Returns:
        str: The success of the operation of adding the document to the vector DB
    """
    # Imported locally because os is not imported at the top of this module.
    import os

    try:
        loader = TextLoader(file_path)
        doc_arr = loader.load()

        # Use the platform's path rules to get the file name instead of
        # splitting on "/" only, so other separators are handled where they apply.
        file_name = os.path.basename(file_path)

        # A single ID is supplied for the loaded document list.
        db.add_documents(documents=doc_arr, ids=[string_to_vector_id(file_name)])
        return "Successfully added the file to the knowledgebase."
    except Exception as e:
        # Failures are reported back as text rather than raised
        return f"Error adding file to knowledgbase: {e}"

@tool
def clear_knowledgebase() -> str:
    """
    Removes all documents from the vector DB knowledgebase to clear it.

    Example call:

    clear_knowledgebase()
    Returns:
        str: The success of the operation of clearing the vector DB
    """
    # NOTE: the docstring above is presumably what @tool exposes as the tool
    # description, so it is treated as runtime text and left untouched.
    try:
        db.reset_collection()
    except Exception as e:
        # Failures are reported back as text rather than raised
        return f"Error clearing the knowledgbase: {e}"

    # Reached only when the reset did not raise
    return "Successfully cleared the knowledgebase."


# Maps each tool name to the corresponding @tool-decorated object defined in this script.
# Per the original author's note, this mapping is also used to create the list of tools
# bound to the agent (that wiring is not visible here - confirm against the caller).
available_vector_db_functions = {
    "query_documents": query_documents,
    "add_doc_to_knowledgebase": add_doc_to_knowledgebase,
    "clear_knowledgebase": clear_knowledgebase
}