diff --git a/korvus-simple-rag/.env.example b/korvus-simple-rag/.env.example
new file mode 100644
index 0000000..b02335b
--- /dev/null
+++ b/korvus-simple-rag/.env.example
@@ -0,0 +1,4 @@
+# Rename this file to .env once you have filled in the below environment variables!
+
+# PostgresML database connection string. See Korvus documentation for getting this set up.
+KORVUS_DATABASE_URL=postgres://user:password@sql.cloud.postgresml.org:6432/korvus_database
\ No newline at end of file
diff --git a/korvus-simple-rag/korvus_rag.py b/korvus-simple-rag/korvus_rag.py
new file mode 100644
--- /dev/null
+++ b/korvus-simple-rag/korvus_rag.py
@@ -0,0 +1,79 @@
+"""Minimal Korvus example: index SQuAD contexts and run one vector search.
+
+Expects ``KORVUS_DATABASE_URL`` to be set in the environment or in a local
+``.env`` file (see ``.env.example``).
+"""
+
+from korvus import Collection, Pipeline
+from datasets import load_dataset
+from time import perf_counter
+from dotenv import load_dotenv
+from rich.console import Console
+import asyncio
+import os
+
+# Keep the demo quick: only this many unique contexts are upserted.
+MAX_DOCUMENTS = 200
+
+
+async def main():
+    """Index a SQuAD sample, run one vector search, then archive."""
+    load_dotenv()
+    os.environ["RUST_BACKTRACE"] = "1"
+    console = Console()
+
+    # Initialize collection
+    collection = Collection("squad")
+
+    # Create and add pipeline
+    pipeline = Pipeline(
+        "squadv1",
+        {
+            "text": {
+                "splitter": {"model": "recursive_character"},
+                "semantic_search": {"model": "intfloat/e5-small-v2"},
+            }
+        },
+    )
+    await collection.add_pipeline(pipeline)
+
+    # Run the rest under try/finally so the collection is still archived
+    # when the download, upsert, or query fails.
+    try:
+        # Prep documents for upserting
+        data = load_dataset("squad", split="train")
+        data = data.to_pandas()
+        data = data.drop_duplicates(subset=["context"])
+        documents = [
+            {"id": r["id"], "text": r["context"], "title": r["title"]}
+            for r in data.head(MAX_DOCUMENTS).to_dict(orient="records")
+        ]
+
+        console.print(
+            f"Upserting {len(documents)} of {len(data)} unique contexts"
+        )
+
+        # Upsert documents
+        await collection.upsert_documents(documents)
+
+        # Query for answer
+        query = "Who won more than 20 grammy awards?"
+        console.print("Querying for context ...")
+        # perf_counter is monotonic, so wall-clock adjustments cannot skew
+        # the measured interval.
+        start = perf_counter()
+        results = await collection.vector_search(
+            {"query": {"fields": {"text": {"query": query}}}, "limit": 5},
+            pipeline,
+        )
+        end = perf_counter()
+        console.print(f"\n Results for '{query}' ", style="bold")
+        console.print(results)
+        console.print(f"Query time = {end - start:0.3f}")
+    finally:
+        # Archive collection
+        await collection.archive()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/korvus-simple-rag/requirements.txt b/korvus-simple-rag/requirements.txt
new file mode 100644
--- /dev/null
+++ b/korvus-simple-rag/requirements.txt
@@ -0,0 +1,4 @@
+korvus==1.1.2
+python-dotenv==0.13.0
+datasets==3.0.0
+rich>=13.0.0,<14