LLM Eval Framework + Breaking LLMs

This commit is contained in:
Cole Medin
2024-08-28 18:01:58 -05:00
parent 3afefe9f61
commit ef5f6c7c43
9 changed files with 926 additions and 6 deletions

6
.gitignore vendored
View File

@@ -2,3 +2,9 @@ __pycache__
prep prep
.env .env
chroma_db chroma_db
test.py
data
creds
credentials
credentials.json
token.json

View File

@@ -1,10 +1,11 @@
python-dotenv==0.13.0 python-dotenv==0.13.0
langchain==0.2.6 langchain==0.2.12
langchain-anthropic==0.1.16 langchain-anthropic==0.1.22
langchain-community==0.2.6 langchain-community==0.2.11
langchain-core==0.2.10 langchain-core==0.2.28
langchain-openai==0.1.10 langchain-openai==0.1.20
langchain-chroma==0.1.2 langchain-chroma==0.1.2
langchain-huggingface==0.0.3
streamlit==1.36.0 streamlit==1.36.0
pdfminer.six==20240706 pdfminer.six==20240706
unstructured[all-docs] unstructured[all-docs]

View File

@@ -0,0 +1,43 @@
# Rename this file to .env once you have filled in the below environment variables!
# No environment variables for Google Drive access!
# See the below instructions for Google Drive authentication for Python:
# https://developers.google.com/drive/api/quickstart/python
# Get your GROQ API Key here -
# https://console.groq.com/keys
GROQ_API_KEY=
# Get your Open AI API Key by following these instructions -
# https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key
# You only need this environment variable set if you set LLM_MODEL to a GPT model
OPENAI_API_KEY=
# Get your Anthropic API Key in your account settings -
# https://console.anthropic.com/settings/keys
# You only need this environment variable set if you set LLM_MODEL to a Claude model
ANTHROPIC_API_KEY=
# After creating a Hugging Face account, get your API token here: https://huggingface.co/settings/tokens
# Then, after installing the Hugging Face CLI, run huggingface-cli login and enter the token there as well.
HUGGINGFACEHUB_API_TOKEN=
# See all Open AI models you can use here -
# https://platform.openai.com/docs/models
# And all Anthropic models you can use here -
# https://docs.anthropic.com/en/docs/about-claude/models
# A good default to go with here is gpt-4o-mini, claude-3-5-sonnet-20240620, or llama3-groq-70b-8192-tool-use-preview
LLM_MODEL=gpt-4o-mini
# Get your personal Asana access token through the developer console in Asana.
# Feel free to follow these instructions -
# https://developers.asana.com/docs/personal-access-token
ASANA_ACCESS_TOKEN=
# The Asana workspace ID is in the URL when you visit your Asana Admin Console (when logged in).
# Go to the URL "https://app.asana.com/admin" and then your workspace ID
# will appear in the URL as a slew of digits once the site loads.
# If your URL is https://app.asana.com/admin/987654321/insights, then your
# Asana workspace ID is 987654321
ASANA_WORKPLACE_ID=

View File

@@ -0,0 +1,95 @@
from datetime import datetime
import streamlit as st
import asyncio
import json
import uuid
import os
from langchain_core.messages import SystemMessage, AIMessage, HumanMessage, ToolMessage
from runnable import get_runnable
@st.cache_resource
def create_chatbot_instance():
    """Build the LangGraph runnable once and cache it across Streamlit reruns."""
    return get_runnable()

# Shared chatbot runnable for the whole app.
chatbot = create_chatbot_instance()

@st.cache_resource
def get_thread_id():
    """Generate a single conversation thread ID, cached across Streamlit reruns."""
    return str(uuid.uuid4())

# Thread ID passed to the checkpointer so conversation state persists per session.
thread_id = get_thread_id()

# System prompt injected as the first message of every conversation.
system_message = f"""
You are a personal assistant who helps manage tasks in Asana and documents in Google Drive.
You never give IDs to the user since those are just for you to keep track of.
The current date is: {datetime.now().date()}
"""
async def prompt_ai(messages):
    """Stream the assistant's reply for the given message history.

    Yields the content of each chat-model chunk as it arrives so the UI
    can render the response incrementally.
    """
    run_config = {"configurable": {"thread_id": thread_id}}
    event_stream = chatbot.astream_events(
        {"messages": messages}, run_config, version="v2"
    )
    async for event in event_stream:
        if event["event"] != "on_chat_model_stream":
            continue
        yield event["data"]["chunk"].content
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~ Main Function with UI Creation ~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
async def main():
    """Render the Streamlit chat UI and stream agent responses into it."""
    st.title("LLM Eval Chatbot")

    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = [
            SystemMessage(content=system_message)
        ]

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        # Serialize the LangChain message to read its type/content generically.
        message_json = json.loads(message.json())
        message_type = message_json["type"]
        if message_type in ["human", "ai", "system"]:
            with st.chat_message(message_type):
                st.markdown(message_json["content"])

    # React to user input
    if prompt := st.chat_input("What would you like to do today?"):
        # Display user message in chat message container
        st.chat_message("user").markdown(prompt)
        # Add user message to chat history
        st.session_state.messages.append(HumanMessage(content=prompt))

        # Display assistant response in chat message container
        response_content = ""
        with st.chat_message("assistant"):
            message_placeholder = st.empty()  # Placeholder for updating the message
            # Run the async generator to fetch responses
            async for chunk in prompt_ai(st.session_state.messages):
                # Chunks are plain strings, or lists of content parts with a
                # "text" key (presumably provider-dependent — TODO confirm).
                if isinstance(chunk, str):
                    response_content += chunk
                elif isinstance(chunk, list):
                    for chunk_text in chunk:
                        if "text" in chunk_text:
                            response_content += chunk_text["text"]
                else:
                    raise Exception("Chunk is not a string or list.")

                # Update the placeholder with the current response content
                message_placeholder.markdown(response_content)

        st.session_state.messages.append(AIMessage(content=response_content))

if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,19 @@
asana==5.0.7
python-dotenv==0.13.0
langchain==0.2.12
langchain-anthropic==0.1.22
langchain-groq==0.1.5
langchain-community==0.2.11
langchain-core==0.2.28
langchain-openai==0.1.20
langchain-chroma==0.1.2
langchain-huggingface==0.0.3
sentence-transformers==3.0.1
unstructured[all-docs]
streamlit==1.36.0
langgraph==0.1.19
aiosqlite==0.20.0
google-api-python-client==2.142.0
google-auth-oauthlib==1.2.1
oauthlib==3.2.2
requests-oauthlib==2.0.0

View File

@@ -0,0 +1,169 @@
from langgraph.graph.message import AnyMessage, add_messages
from langgraph.checkpoint.aiosqlite import AsyncSqliteSaver
from langchain_core.runnables import RunnableConfig
from langgraph.graph import END, StateGraph
from typing_extensions import TypedDict
from typing import Annotated, Literal, Dict
from dotenv import load_dotenv
import streamlit as st
import json
import os
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import ToolMessage, AIMessage
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEndpoint, ChatHuggingFace
from tools.asana_tools import available_asana_functions
from tools.google_drive_tools import available_drive_functions
from tools.vector_db_tools import available_vector_db_functions
load_dotenv()

# Model that drives the agent; a substring of the name selects the provider.
model = os.getenv('LLM_MODEL', 'gpt-4o')

# Substring of the model name -> chat class for that provider.
model_mapping = {
    "gpt": ChatOpenAI,
    "claude": ChatAnthropic,
    "groq": ChatGroq,
    "llama": ChatHuggingFace  # ChatHuggingFace doesn't work for tool calling yet with HuggingFaceEndpoint but will in the future
}

@st.cache_resource
def get_local_model():
    """Return a Hugging Face inference endpoint for the configured model (cached)."""
    return HuggingFaceEndpoint(
        repo_id=model,
        task="text-generation",
        max_new_tokens=1024,
        do_sample=False
    )

    # If you want to run the model absolutely locally - VERY resource intense!
    # return HuggingFacePipeline.from_model_id(
    #     model_id=model,
    #     task="text-generation",
    #     pipeline_kwargs={
    #         "max_new_tokens": 1024,
    #         "top_k": 50,
    #         "temperature": 0.4
    #     },
    # )

# All tools from every tool module, keyed by function name.
available_functions = available_asana_functions | available_drive_functions | available_vector_db_functions
tools = list(available_functions.values())

# Pick the chat class whose key appears in the model name.
for key, chatbot_class in model_mapping.items():
    if key in model.lower():
        chatbot = chatbot_class(model=model) if key != "llama" else chatbot_class(llm=get_local_model())
        break
else:
    # Fail fast with a clear message instead of a NameError at bind_tools below.
    raise ValueError(
        f"LLM_MODEL '{model}' does not match any supported provider "
        f"(expected the name to contain one of: {', '.join(model_mapping)})."
    )

chatbot_with_tools = chatbot.bind_tools(tools)
### State
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        messages: List of chat messages.
    """
    # add_messages merges newly returned messages into the running list
    # instead of replacing it.
    messages: Annotated[list[AnyMessage], add_messages]
async def call_model(state: GraphState, config: RunnableConfig) -> Dict[str, AnyMessage]:
    """
    Function that calls the model to generate a response.

    Args:
        state (GraphState): The current graph state
        config (RunnableConfig): Runtime config (carries the thread_id for the checkpointer)

    Returns:
        dict: The updated state with a new AI message
    """
    print("---CALL MODEL---")

    # Keep every non-AI message, and only those AI messages that carry
    # response metadata. The original lambda relied on `and` binding tighter
    # than `or`; the parentheses make that precedence explicit.
    messages = [
        m for m in state["messages"]
        if not isinstance(m, AIMessage)
        or (hasattr(m, "response_metadata") and m.response_metadata)
    ]

    # Invoke the chatbot with the bound tools
    response = await chatbot_with_tools.ainvoke(messages, config)

    # We return an object because this will get added to the existing list
    return {"messages": response}
def tool_node(state: GraphState) -> Dict[str, AnyMessage]:
    """
    Function that handles all tool calls.

    Args:
        state (GraphState): The current graph state

    Returns:
        dict: The updated state with tool messages
    """
    print("---TOOL NODE---")
    history = state["messages"]
    last_message = history[-1] if history else None

    tool_messages = []
    if last_message and last_message.tool_calls:
        for tool_call in last_message.tool_calls:
            # Resolve the requested tool by name from the combined registry.
            requested_tool = available_functions.get(tool_call['name'], None)
            if requested_tool is None:
                raise Exception(f"Tool '{tool_call['name']}' not found.")

            print(f"\n\nInvoking tool: {tool_call['name']} with args {tool_call['args']}")
            result = requested_tool.invoke(tool_call['args'])
            print(f"Result of invoking tool: {result}\n\n")

            payload = result if isinstance(result, str) else json.dumps(result)
            tool_messages.append(ToolMessage(payload, tool_call_id=tool_call['id']))

    return {'messages': tool_messages}
def should_continue(state: GraphState) -> Literal["__end__", "tools"]:
    """
    Determine whether to continue or end the workflow based on if there are tool calls to make.

    Args:
        state (GraphState): The current graph state

    Returns:
        str: The next node to execute or END
    """
    print("---SHOULD CONTINUE---")
    history = state["messages"]
    last = history[-1] if history else None

    # Pending tool calls route to the tool node; otherwise the run is done.
    if last and last.tool_calls:
        return "tools"
    return END
def get_runnable():
    """Build and compile the agent/tools LangGraph into a runnable app.

    Uses an in-memory SQLite checkpointer so conversation state is kept
    per thread_id for the lifetime of the process.
    """
    graph = StateGraph(GraphState)

    # Agent node generates responses; tools node executes any tool calls.
    graph.add_node("agent", call_model)
    graph.add_node("tools", tool_node)
    graph.set_entry_point("agent")
    graph.add_conditional_edges("agent", should_continue)
    graph.add_edge("tools", "agent")

    # Compile the LangGraph graph into a runnable
    checkpointer = AsyncSqliteSaver.from_conn_string(":memory:")
    return graph.compile(checkpointer=checkpointer)

View File

@@ -0,0 +1,193 @@
import asana
from asana.rest import ApiException
from dotenv import load_dotenv
from datetime import datetime
import json
import os
from langchain_core.tools import tool
load_dotenv()

# Configure the Asana client with the personal access token from the environment.
configuration = asana.Configuration()
configuration.access_token = os.getenv('ASANA_ACCESS_TOKEN', '')
api_client = asana.ApiClient(configuration)

# create an instance of the different Asana API classes
projects_api_instance = asana.ProjectsApi(api_client)
tasks_api_instance = asana.TasksApi(api_client)

# Workspace all projects/tasks live in (NOTE: the env var is spelled WORKPLACE).
workspace_gid = os.getenv("ASANA_WORKPLACE_ID", "")

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~ AI Agent Tool Functions ~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@tool
def create_asana_task(task_name: str, project_gid: str, due_on: str = "today") -> str:
    """
    Creates a task in Asana given the name of the task and when it is due

    Example call:

    create_asana_task("Test Task", "1207789085525921", "2024-06-24")

    Args:
        task_name (str): The name of the task in Asana
        project_gid (str): The ID of the project to add the task to
        due_on (str): The date the task is due in the format YYYY-MM-DD. If not given, the current day is used

    Returns:
        str: The API response of adding the task to Asana or an error message if the API call threw an error
    """
    # Default the due date to today when the agent doesn't supply one.
    if due_on == "today":
        due_on = str(datetime.now().date())

    task_body = {
        "data": {
            "name": task_name,
            "due_on": due_on,
            "projects": [project_gid]
        }
    }

    try:
        api_response = tasks_api_instance.create_task(task_body, {})
        return json.dumps(api_response, indent=2)
    except ApiException as e:
        return f"Exception when calling TasksApi->create_task: {e}"
@tool
def get_asana_projects() -> str:
    """
    Gets all of the projects in the user's Asana workspace

    Returns:
        str: The API response from getting the projects or an error message if the projects couldn't be fetched.
        The API response is an array of project objects, where each project object looks like:
        {'gid': '1207789085525921', 'name': 'Project Name', 'resource_type': 'project'}
    """
    opts = {
        'limit': 50,  # int | Results per page. The number of objects to return per page. The value must be between 1 and 100.
        'workspace': workspace_gid,  # str | The workspace or organization to filter projects on.
        'archived': False  # bool | Only return projects whose `archived` field takes on the value of this parameter.
    }

    try:
        # get_projects returns a paginated iterator; materialize it for JSON.
        api_response = projects_api_instance.get_projects(opts)
        return json.dumps(list(api_response), indent=2)
    except ApiException as e:
        # Message fixed: this endpoint is get_projects, not create_project.
        return "Exception when calling ProjectsApi->get_projects: %s\n" % e
@tool
def create_asana_project(project_name: str, due_on=None) -> str:
    """
    Creates a project in Asana given the name of the project and optionally when it is due

    Example call:

    create_asana_project("Test Project", "2024-06-24")

    Args:
        project_name (str): The name of the project in Asana
        due_on (str): The date the project is due in the format YYYY-MM-DD. If not supplied, the project is not given a due date

    Returns:
        str: The API response of adding the project to Asana or an error message if the API call threw an error
    """
    # The project to create, scoped to the configured workspace.
    project_data = {
        "name": project_name,
        "due_on": due_on,
        "workspace": workspace_gid
    }

    try:
        # Create a project
        api_response = projects_api_instance.create_project({"data": project_data}, {})
        return json.dumps(api_response, indent=2)
    except ApiException as e:
        return "Exception when calling ProjectsApi->create_project: %s\n" % e
@tool
def get_asana_tasks(project_gid: str) -> str:
    """
    Gets all the Asana tasks in a project

    Example call:

    get_asana_tasks("1207789085525921")

    Args:
        project_gid (str): The ID of the project in Asana to fetch the tasks for

    Returns:
        str: The API response from fetching the tasks for the project in Asana or an error message if the API call threw an error
        The API response is an array of tasks objects where each task object is in the format:
        {'gid': '1207780961742158', 'created_at': '2024-07-11T16:25:46.380Z', 'due_on': None or date in format "YYYY-MM-DD", 'name': 'Test Task'}
    """
    request_options = {
        'limit': 50,                             # results per page (must be 1-100)
        'project': project_gid,                  # only tasks in this project
        'opt_fields': "created_at,name,due_on",  # extra fields beyond the compact resource
    }

    try:
        # Get multiple tasks; the iterator is materialized for JSON encoding.
        task_iterator = tasks_api_instance.get_tasks(request_options)
        return json.dumps(list(task_iterator), indent=2)
    except ApiException as e:
        return "Exception when calling TasksApi->get_tasks: %s\n" % e
@tool
def update_asana_task(task_gid: str, data: dict) -> str:
    """
    Updates a task in Asana by updating one or both of completed and/or the due date

    Example call:

    update_asana_task("1207780961742158", {"completed": True, "due_on": "2024-07-13"})

    Args:
        task_gid (str): The ID of the task to update
        data (dict): A dictionary with either one or both of the keys 'completed' and/or 'due_on'
                     If given, completed needs to be either True or False.
                     If given, the due date needs to be in the format 'YYYY-MM-DD'.

    Returns:
        str: The API response of updating the task or an error message if the API call threw an error
    """
    # Data: {"completed": True or False, "due_on": "YYYY-MM-DD"}
    update_body = {"data": data}

    try:
        # Update a task
        result = tasks_api_instance.update_task(update_body, task_gid, {})
        return json.dumps(result, indent=2)
    except ApiException as e:
        return "Exception when calling TasksApi->update_task: %s\n" % e
@tool
def delete_task(task_gid: str) -> str:
    """
    Deletes a task in Asana

    Example call:

    delete_task("1207780961742158")

    Args:
        task_gid (str): The ID of the task to delete

    Returns:
        str: The API response of deleting the task or an error message if the API call threw an error
    """
    try:
        # Delete a task
        result = tasks_api_instance.delete_task(task_gid)
    except ApiException as e:
        return "Exception when calling TasksApi->delete_task: %s\n" % e
    return json.dumps(result, indent=2)
# Maps the function names to the actual function object in the script
# This mapping will also be used to create the list of tools to bind to the agent
# Keys must match the tool names the LLM emits in its tool calls.
available_asana_functions = {
    "create_asana_task": create_asana_task,
    "get_asana_projects": get_asana_projects,
    "create_asana_project": create_asana_project,
    "get_asana_tasks": get_asana_tasks,
    "update_asana_task": update_asana_task,
    "delete_task": delete_task
}

View File

@@ -0,0 +1,273 @@
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from langchain_core.tools import tool
import streamlit as st
import os
import io
# OAuth scopes: full read/write access to the user's Drive files.
SCOPES = [
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/drive.file'
]

@st.cache_resource
def get_google_drive_service():
    """
    Gets the Google Drive credentials with the scope of full access to Drive files

    Runs the installed-app OAuth flow in a browser on first use, caches the
    resulting token in token.json, and returns a Drive v3 service client.
    """
    creds = None
    # token.json stores the access/refresh tokens from a previous login.
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            # Expired but refreshable: refresh silently instead of re-prompting.
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                "credentials/credentials.json", SCOPES
            )
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open("token.json", "w") as token:
            token.write(creds.to_json())

    return build("drive", "v3", credentials=creds)

# Shared Drive service used by all the tools below (built once per process).
service = get_google_drive_service()
@tool
def search_file(query: str) -> str:
    """
    Searches for files in Google Drive based on a query string.

    Arguments:
    - query (str): The search query to find files. This requires a specific format for Google Drive:
        To search for files that have 'example' in the name - query should be: name contains 'example'
        To search for files that have 'example text' in the file text - query should be: fullText contains 'example text'

    Returns:
    - str: A string representation of a list of dictionaries containing the file ID and name of the matched files.

    Example usage:
    search_file("name contains 'report'")
    """
    try:
        # Exclude folders so only regular files come back.
        results = service.files().list(q=f"mimeType!='application/vnd.google-apps.folder' and {query}", spaces='drive', fields="files(id, name)").execute()
        return str(results.get('files', []))
    except Exception as e:
        return f"Failed to search Google Drive: {e}"
@tool
def download_file(file_id: str, file_name: str, mime_type: str = 'text/plain') -> str:
    """
    Downloads a Google Docs file (or similar) from Google Drive and saves it to a specified path.

    Arguments:
    - file_id (str): The unique ID of the file to be downloaded.
    - file_name (str): The name of the file (including the extension) to download it locally as.
    - mime_type (str, optional): The MIME type to export the file as. Defaults to 'text/plain'.

    Returns:
    - str: A message confirming the file has been downloaded to the specified path.

    Example usage:
    download_file("1aBcDeFgHiJkLmNoPqRsTuVwXyZ", "file.txt", "text/plain")
    """
    try:
        # Downloads always land in the local "data" directory.
        target_dir = "data"
        if not os.path.exists(target_dir):
            os.makedirs(target_dir, exist_ok=True)

        export_request = service.files().export_media(fileId=file_id, mimeType=mime_type)
        local_path = f"{target_dir}/{file_name}"

        with io.FileIO(local_path, 'wb') as handle:
            downloader = MediaIoBaseDownload(handle, export_request)
            finished = False
            while not finished:
                _, finished = downloader.next_chunk()

        return f"File downloaded to {local_path}"
    except Exception as e:
        return f"Error downloading the file: {e}"
@tool
def upload_file(file_path: str, folder_id: str = None) -> str:
    """
    Uploads a file to a specific folder in Google Drive. If no folder ID is provided, it uploads to the root directory.

    Arguments:
    - file_path (str): The local path to the file that will be uploaded.
    - folder_id (str, optional): The ID of the Google Drive folder where the file will be uploaded. Defaults to None (uploads to root).

    Returns:
    - str: A confirmation message containing the ID of the uploaded file, or an error message.

    Example usage:
    upload_file("/path/to/local/file.txt", "1aBcDeFgHiJkLmNoPqRsTuVwXyZ")
    """
    try:
        # Name the Drive file after the local file's basename.
        file_metadata = {'name': file_path.split("/")[-1]}
        if folder_id:
            file_metadata['parents'] = [folder_id]

        media = MediaFileUpload(file_path, resumable=True)
        file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        return f"File uploaded with ID: {file.get('id')}"
    except Exception as e:
        return f"Error uploading the file: {e}"
@tool
def delete_file(file_id: str) -> str:
    """
    Deletes a file from Google Drive based on its file ID.

    Arguments:
    - file_id (str): The unique ID of the file to be deleted.

    Returns:
    - str: A message confirming the deletion of the file.

    Example usage:
    delete_file("1aBcDeFgHiJkLmNoPqRsTuVwXyZ")
    """
    try:
        service.files().delete(fileId=file_id).execute()
    except Exception as e:
        return f"Error deleting the file: {e}"
    return f"File with ID {file_id} has been deleted."
@tool
def update_file(file_id: str, new_file_path: str) -> str:
    """
    Updates the contents of a file in Google Drive by replacing it with a new file.

    Arguments:
    - file_id (str): The unique ID of the file to be updated.
    - new_file_path (str): The local path to the new file that will replace the existing file.

    Returns:
    - str: A message confirming the file has been updated.

    Example usage:
    update_file("1aBcDeFgHiJkLmNoPqRsTuVwXyZ", "/path/to/new/file.txt")
    """
    try:
        media = MediaFileUpload(new_file_path, resumable=True)
        # The returned file resource isn't needed; only success matters here.
        service.files().update(fileId=file_id, media_body=media).execute()
        return f"File with ID {file_id} has been updated."
    except Exception as e:
        return f"Error updating the file: {e}"
@tool
def search_folder(query: str) -> str:
    """
    Searches for folders in Google Drive based on a query string.

    Arguments:
    - query (str): The search query to find folders - just the name or part of the name of folder(s) to search for.

    Returns:
    - str: A string representation of a list of dictionaries containing the folder ID and name of the matched folders.

    Example usage:
    search_folder("name contains 'meeting_notes'")
    """
    try:
        # Restrict to the Drive folder MIME type; the query is embedded in a
        # `name contains` clause, so a quote in `query` would break the filter.
        results = service.files().list(q=f"mimeType='application/vnd.google-apps.folder' and name contains '{query}'",
                                       spaces='drive', fields="files(id, name)").execute()
        return str(results.get('files', []))
    except Exception as e:
        return f"Error searching folders: {e}"
@tool
def create_folder(folder_name: str, parent_folder_id: str = None) -> str:
    """
    Creates a folder in Google Drive. If a parent folder ID is provided, the folder is created inside that folder.

    Arguments:
    - folder_name (str): The name of the folder to be created.
    - parent_folder_id (str, optional): The ID of the parent folder where the new folder will be created. Defaults to None (creates in root).

    Returns:
    - str: The ID of the created folder.

    Example usage:
    create_folder("New Meeting Folder", "1aBcDeFgHiJkLmNoPqRsTuVwXyZ")
    """
    try:
        # Folders in Drive are files with a special MIME type.
        folder_metadata = {
            'name': folder_name,
            'mimeType': 'application/vnd.google-apps.folder'
        }
        if parent_folder_id:
            folder_metadata['parents'] = [parent_folder_id]

        new_folder = service.files().create(body=folder_metadata, fields='id').execute()
        return f"Folder created with ID: {new_folder.get('id')}"
    except Exception as e:
        return f"Error creating the folder: {e}"
@tool
def delete_folder(folder_id: str) -> str:
    """
    Deletes a folder from Google Drive based on its folder ID.

    Arguments:
    - folder_id (str): The unique ID of the folder to be deleted.

    Returns:
    - str: A message confirming the deletion of the folder.

    Example usage:
    delete_folder("1aBcDeFgHiJkLmNoPqRsTuVwXyZ")
    """
    try:
        service.files().delete(fileId=folder_id).execute()
    except Exception as e:
        return f"Error deleting the folder: {e}"
    return f"Folder with ID {folder_id} has been deleted."
@tool
def create_text_file(content: str, file_name: str) -> str:
    """
    Creates a text file with the given content + file name and returns the file path.

    Arguments:
    - content (str): The text content to be written to the file.
    - file_name (str): The name of the file to be created (including the file extension, typically .txt).

    Returns:
    - str: The path to the created text file.

    Example usage:
    create_text_file("Hello, world!", "example.txt")
    """
    try:
        # All generated files go into the local "data" directory.
        out_dir = "data"
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        out_path = f"{out_dir}/{file_name}"
        with open(out_path, "w") as out_file:
            out_file.write(content)
        return out_path
    except Exception as e:
        return f"Error creating the text file: {e}"
# Maps the function names to the actual function object in the script
# This mapping will also be used to create the list of tools to bind to the agent
# Keys must match the tool names the LLM emits in its tool calls.
available_drive_functions = {
    "search_file": search_file,
    "download_file": download_file,
    "upload_file": upload_file,
    "delete_file": delete_file,
    "update_file": update_file,
    "search_folder": search_folder,
    "create_folder": create_folder,
    "delete_folder": delete_folder,
    "create_text_file": create_text_file
}

View File

@@ -0,0 +1,121 @@
import streamlit as st
import hashlib
import re
from langchain_core.tools import tool
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
@st.cache_resource
def get_chroma_instance():
    """Return the Chroma vector store persisted at ./chroma_db (cached across reruns)."""
    # Create the open-source embedding function
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Get the Chroma instance from what is saved to the disk
    return Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

# Module-wide vector DB handle used by all the tools below.
db = get_chroma_instance()
def string_to_vector_id(input_string: str, max_length: int = 64) -> str:
    """
    Convert a string into a vector-friendly ID.

    Special characters are stripped, spaces become underscores, and any
    result longer than *max_length* is replaced by a truncated SHA-256
    hex digest so the ID always fits.

    Arguments:
    - input_string (str): The input string to convert to a vector ID.
    - max_length (int, optional): The maximum length of the vector ID. Defaults to 64 characters.

    Returns:
    - str: A string that can be used as a vector ID.

    Example usage:
    string_to_vector_id("Example String For Vector ID")
    """
    # Drop everything except letters, digits, whitespace and underscores,
    # then make spaces underscore-safe.
    cleaned = re.sub(r'[^a-zA-Z0-9\s_]', '', input_string).replace(" ", "_")

    if len(cleaned) <= max_length:
        return cleaned

    # Too long: fall back to a hash that fits within the limit.
    return hashlib.sha256(cleaned.encode()).hexdigest()[:max_length]
@tool
def query_documents(question: str) -> str:
    """
    Uses RAG to query documents for information to answer a question
    that requires specific context that could be found in documents

    Example call:

    query_documents("What are the action items from the meeting on the 20th?")

    Args:
        question (str): The question the user asked that might be answerable from the searchable documents

    Returns:
        str: The list of texts (and their sources) that matched with the question the closest using RAG
    """
    try:
        # Fetch the three closest documents by embedding similarity.
        matches = db.similarity_search(question, k=3)
        formatted = [
            f"Source: {doc.metadata.get('source', 'NA')}\nContent: {doc.page_content}"
            for doc in matches
        ]
        return str(formatted)
    except Exception as e:
        return f"Error querying the vector DB: {e}"
@tool
def add_doc_to_knowledgebase(file_path: str) -> str:
    """
    Adds a local document to the vector DB knowledgebase for RAG.
    This function can only be called on local documents - Google Drive docs must be downloaded first.
    The content of the file is put in the vector DB with the metadata
    including the file source. The ID is derived from the file name.

    Example call:

    add_doc_to_knowledgebase("/path/to/local/file")

    Args:
        file_path (str): The local path to the file to add to the knowledgebase (NOT Google Drive)

    Returns:
        str: The success of the operation of adding the document to the vector DB
    """
    try:
        loader = TextLoader(file_path)
        doc_arr = loader.load()

        # Deterministic ID from the file name so re-adding the same file
        # replaces its entry instead of duplicating it.
        db.add_documents(documents=doc_arr, ids=[string_to_vector_id(file_path.split("/")[-1])])
        return "Successfully added the file to the knowledgebase."
    except Exception as e:
        return f"Error adding file to knowledgebase: {e}"
@tool
def clear_knowledgebase() -> str:
    """
    Removes all documents from the vector DB knowledgebase to clear it.

    Example call:

    clear_knowledgebase()

    Returns:
        str: The success of the operation of clearing the vector DB
    """
    try:
        # Drops and recreates the underlying Chroma collection.
        db.reset_collection()
        return "Successfully cleared the knowledgebase."
    except Exception as e:
        # Typo fixed in the error message ("knowledgbase" -> "knowledgebase").
        return f"Error clearing the knowledgebase: {e}"
# Maps the function names to the actual function object in the script
# This mapping will also be used to create the list of tools to bind to the agent
# Keys must match the tool names the LLM emits in its tool calls.
available_vector_db_functions = {
    "query_documents": query_documents,
    "add_doc_to_knowledgebase": add_doc_to_knowledgebase,
    "clear_knowledgebase": clear_knowledgebase
}