Korvus RAG testing... cool example of using a tool for easy RAG with Python

This commit is contained in:
Cole Medin
2024-09-20 08:38:23 -05:00
parent 13a1969254
commit 9a3111d578
3 changed files with 69 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# Rename this file to .env once you have filled in the environment variables below!
# PostgresML database connection string. See the Korvus documentation for how to set this up.
KORVUS_DATABASE_URL=postgres://user:password@sql.cloud.postgresml.org:6432/korvus_database

View File

@@ -0,0 +1,61 @@
from korvus import Collection, Pipeline
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich.console import Console
import asyncio
import os
async def main():
    """Run an end-to-end Korvus vector-search demo over SQuAD contexts.

    Steps, in order:

    1. Load environment variables from a ``.env`` file (presumably where
       Korvus picks up its database connection string -- confirm against
       the Korvus documentation).
    2. Register the ``squadv1`` pipeline on the ``squad`` collection.
    3. Load the SQuAD training split, drop rows with duplicate contexts and
       upsert the first 200 resulting documents.
    4. Run a single vector search and print the results together with the
       elapsed query time.
    5. Archive the collection.

    The collection is archived in a ``finally`` clause, so a failure while
    loading, upserting or searching does not leave the demo collection
    behind; the original exception still propagates to the caller.
    """
    load_dotenv()
    # NOTE(review): presumably enables backtraces from Korvus' native layer -- confirm.
    os.environ["RUST_BACKTRACE"] = "1"
    console = Console()

    # Initialize collection
    collection = Collection("squad")

    # Create and add pipeline
    pipeline = Pipeline(
        "squadv1",
        {
            "text": {
                "splitter": {"model": "recursive_character"},
                "semantic_search": {"model": "intfloat/e5-small-v2"},
            }
        },
    )
    await collection.add_pipeline(pipeline)

    # Everything after the pipeline is registered runs under try/finally so
    # the collection is archived on both the success and the failure path.
    try:
        # Prep documents for upserting: one document per unique context.
        data = load_dataset("squad", split="train")
        data = data.to_pandas()
        data = data.drop_duplicates(subset=["context"])
        documents = [
            {"id": r["id"], "text": r["context"], "title": r["title"]}
            for r in data.to_dict(orient="records")
        ]
        print(len(documents))

        # Upsert only the first 200 documents to keep the demo quick.
        await collection.upsert_documents(documents[:200])

        # Query for answer
        query = "Who won more than 20 grammy awards?"
        console.print("Querying for context ...")
        start = time()
        results = await collection.vector_search(
            {"query": {"fields": {"text": {"query": query}}}, "limit": 5},
            pipeline,
        )
        end = time()

        console.print(f"\n Results for '{query}' ", style="bold")
        console.print(results)
        console.print(f"Query time = {end - start:0.3f}")
    finally:
        # Archive collection
        await collection.archive()
if __name__ == "__main__":
    # Script entry point: run the async demo to completion via asyncio.run.
    asyncio.run(main())

View File

@@ -0,0 +1,4 @@
korvus==1.1.2
asyncio==3.4.3
python-dotenv==0.13.0
datasets==3.0.0