Korvus RAG testing... cool example of using a tool for easy RAG with Python

2026-01-19 21:40:32 +00:00 · 2024-09-20 08:38:23 -05:00
parent 13a1969254
commit 9a3111d578
3 changed files with 69 additions and 0 deletions
--- a/korvus-simple-rag/.env.example
+++ b/korvus-simple-rag/.env.example
@@ -0,0 +1,4 @@
+# Rename this file to .env once you have filled in the environment variables below!
+
+# PostgresML database connection string. See Korvus documentation for getting this set up.
+KORVUS_DATABASE_URL=postgres://user:password@sql.cloud.postgresml.org:6432/korvus_database
--- a/korvus-simple-rag/korvus_rag.py
+++ b/korvus-simple-rag/korvus_rag.py
@@ -0,0 +1,61 @@
+from korvus import Collection, Pipeline
+from datasets import load_dataset
+from time import time
+from dotenv import load_dotenv
+from rich.console import Console
+import asyncio
+import os
+
+
+async def main():
+    load_dotenv()
+    os.environ["RUST_BACKTRACE"] = "1"
+    console = Console()
+
+    # Initialize collection
+    collection = Collection("squad")
+
+    # Create and add pipeline
+    pipeline = Pipeline(
+        "squadv1",
+        {
+            "text": {
+                "splitter": {"model": "recursive_character"},
+                "semantic_search": {"model": "intfloat/e5-small-v2"},
+            }
+        },
+    )
+    await collection.add_pipeline(pipeline)
+
+    # Prep documents for upserting
+    data = load_dataset("squad", split="train")
+    data = data.to_pandas()
+    data = data.drop_duplicates(subset=["context"])
+    documents = [
+        {"id": r["id"], "text": r["context"], "title": r["title"]}
+        for r in data.to_dict(orient="records")
+    ]
+
+    print(len(documents))
+
+    # Upsert documents
+    await collection.upsert_documents(documents[:200])
+
+    # Query for answer
+    query = "Who won more than 20 grammy awards?"
+    console.print("Querying for context ...")
+    start = time()
+    results = await collection.vector_search(
+        {"query": {"fields": {"text": {"query": query}}}, "limit": 5}, pipeline
+    )
+    end = time()
+    console.print("\n Results for '%s' " % (query), style="bold")
+    console.print(results)
+    console.print("Query time = %0.3f" % (end - start))
+
+    # Archive collection
+    await collection.archive()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())  # start an event loop and run the demo to completion
--- a/korvus-simple-rag/requirements.txt
+++ b/korvus-simple-rag/requirements.txt
@@ -0,0 +1,4 @@
+korvus==1.1.2
+asyncio==3.4.3  # NOTE(review): asyncio ships with Python 3; this PyPI backport pin looks unnecessary
+python-dotenv==0.13.0
+datasets==3.0.0