feat: rename store path to source_id and vectors collection to sources

2026-02-21 20:01:26 +00:00 · 2024-09-09 15:46:18 +01:00
parent 1bb81614a5
commit 2f9c72c1cf
10 changed files with 43 additions and 40 deletions
--- a/application/api/answer/routes.py
+++ b/application/api/answer/routes.py
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 mongo = MongoClient(settings.MONGO_URI)
 db = mongo["docsgpt"]
 conversations_collection = db["conversations"]
-vectors_collection = db["vectors"]
+sources_collection = db["sources"]
 prompts_collection = db["prompts"]
 api_key_collection = db["api_keys"]
 answer = Blueprint("answer", __name__)
@@ -91,7 +91,7 @@ def get_data_from_api_key(api_key):


 def get_retriever(source_id: str):
-    doc = vectors_collection.find_one({"_id": ObjectId(source_id)})
+    doc = sources_collection.find_one({"_id": ObjectId(source_id)})
    if doc is None:
        raise Exception("Source document does not exist", 404)
    retriever_name = None if "retriever" not in doc else doc["retriever"]
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -9,7 +9,7 @@ from application.core.settings import settings
 mongo = MongoClient(settings.MONGO_URI)
 db = mongo["docsgpt"]
 conversations_collection = db["conversations"]
-vectors_collection = db["vectors"]
+sources_collection = db["sources"]

 current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

@@ -60,8 +60,8 @@ def upload_index_files():
            os.makedirs(save_dir)
        file_faiss.save(os.path.join(save_dir, "index.faiss"))
        file_pkl.save(os.path.join(save_dir, "index.pkl"))
-    # create entry in vectors_collection
-    vectors_collection.insert_one(
+    # create entry in sources_collection
+    sources_collection.insert_one(
        {
            "_id": ObjectId(id),
            "user": user,
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -17,7 +17,7 @@ from application.vectorstore.vector_creator import VectorCreator
 mongo = MongoClient(settings.MONGO_URI)
 db = mongo["docsgpt"]
 conversations_collection = db["conversations"]
-vectors_collection = db["vectors"]
+sources_collection = db["sources"]
 prompts_collection = db["prompts"]
 feedback_collection = db["feedback"]
 api_key_collection = db["api_keys"]
@@ -106,7 +106,7 @@ def delete_by_ids():
        return {"status": "error"}

    if settings.VECTOR_STORE == "faiss":
-        result = vectors_collection.delete_index(ids=ids)
+        result = sources_collection.delete_index(ids=ids)
        if result:
            return {"status": "ok"}
    return {"status": "error"}
@@ -116,9 +116,9 @@ def delete_by_ids():
 def delete_old():
    """Delete old indexes."""
    import shutil
-    path = request.args.get("path")
-    doc = vectors_collection.find_one({
-        "_id": ObjectId(path),
+    source_id = request.args.get("source_id")
+    doc = sources_collection.find_one({
+        "_id": ObjectId(source_id),
        "user": "local",
    })
    if(doc is None):
@@ -129,10 +129,10 @@ def delete_old():
        except FileNotFoundError:
            pass
    else:
-        vetorstore = VectorCreator.create_vectorstore(settings.VECTOR_STORE, path=str(doc["_id"]))
+        vetorstore = VectorCreator.create_vectorstore(settings.VECTOR_STORE, source_id=str(doc["_id"]))
        vetorstore.delete_index()
-    vectors_collection.delete_one({
-        "_id": ObjectId(path),
+    sources_collection.delete_one({
+        "_id": ObjectId(source_id),
    })

    return {"status": "ok"}
@@ -244,8 +244,8 @@ def combined_json():
        }
    ]
    # structure: name, language, version, description, fullName, date, docLink
-    # append data from vectors_collection in sorted order in descending order of date
-    for index in vectors_collection.find({"user": user}).sort("date", -1):
+    # append data from sources_collection, sorted by date in descending order
+    for index in sources_collection.find({"user": user}).sort("date", -1):
        data.append(
            {
                "id": str(index["_id"]),
--- a/application/parser/open_ai_func.py
+++ b/application/parser/open_ai_func.py
@@ -12,8 +12,8 @@ from retry import retry

@retry(tries=10, delay=60)
 def store_add_texts_with_retry(store, i, id):
-    # add store to the metadata 
-    i.metadata["store"] = str(id)
+    # add source_id to the metadata so vector stores can filter chunks by source
+    i.metadata["source_id"] = str(id)
    store.add_texts([i.page_content], metadatas=[i.metadata])
    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])

@@ -34,13 +34,13 @@ def call_openai_api(docs, folder_name, id, task_status):
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            docs_init=docs_init,
-            path=f"{folder_name}",
+            source_id=f"{folder_name}",
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
    else:
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
-            path=str(id),
+            source_id=str(id),
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
    # Uncomment for MPNet embeddings
--- a/application/vectorstore/elasticsearch.py
+++ b/application/vectorstore/elasticsearch.py
@@ -9,9 +9,9 @@ import elasticsearch
 class ElasticsearchStore(BaseVectorStore):
    _es_connection = None  # Class attribute to hold the Elasticsearch connection

-    def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
+    def __init__(self, source_id, embeddings_key, index_name=settings.ELASTIC_INDEX):
        super().__init__()
-        self.path = path.replace("application/indexes/", "").rstrip("/")
+        self.source_id = source_id.replace("application/indexes/", "").rstrip("/")
        self.embeddings_key = embeddings_key
        self.index_name = index_name
        
@@ -81,7 +81,7 @@ class ElasticsearchStore(BaseVectorStore):
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vector = embeddings.embed_query(question)
        knn = {
-            "filter": [{"match": {"metadata.store.keyword": self.path}}],
+            "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
            "field": "vector",
            "k": k,
            "num_candidates": 100,
@@ -100,7 +100,7 @@ class ElasticsearchStore(BaseVectorStore):
                            }
                        }
                    ],
-                    "filter": [{"match": {"metadata.store.keyword": self.path}}],
+                    "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
                }
            },
            "rank": {"rrf": {}},
@@ -209,4 +209,4 @@ class ElasticsearchStore(BaseVectorStore):

    def delete_index(self):
        self._es_connection.delete_by_query(index=self.index_name, query={"match": {
-                                      "metadata.store.keyword": self.path}},)
+                                      "metadata.source_id.keyword": self.source_id}},)
--- a/application/vectorstore/mongodb.py
+++ b/application/vectorstore/mongodb.py
@@ -5,7 +5,7 @@ from application.vectorstore.document_class import Document
 class MongoDBVectorStore(BaseVectorStore):
    def __init__(
        self,
-        path: str = "",
+        source_id: str = "",
        embeddings_key: str = "embeddings",
        collection: str = "documents",
        index_name: str = "vector_search_index",
@@ -18,7 +18,7 @@ class MongoDBVectorStore(BaseVectorStore):
        self._embedding_key = embedding_key
        self._embeddings_key = embeddings_key
        self._mongo_uri = settings.MONGO_URI
-        self._path = path.replace("application/indexes/", "").rstrip("/")
+        self._source_id = source_id.replace("application/indexes/", "").rstrip("/")
        self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)

        try:
@@ -46,7 +46,7 @@ class MongoDBVectorStore(BaseVectorStore):
                    "numCandidates": k * 10, 
                    "index": self._index_name,
                    "filter": {
-                        "store": {"$eq": self._path}
+                        "source_id": {"$eq": self._source_id}
                    }
                }
            }
@@ -123,4 +123,4 @@ class MongoDBVectorStore(BaseVectorStore):
        return result_ids
    
    def delete_index(self, *args, **kwargs):
-        self._collection.delete_many({"store": self._path})
+        self._collection.delete_many({"source_id": self._source_id})
--- a/application/vectorstore/qdrant.py
+++ b/application/vectorstore/qdrant.py
@@ -5,12 +5,12 @@ from qdrant_client import models


 class QdrantStore(BaseVectorStore):
-    def __init__(self, path: str = "", embeddings_key: str = "embeddings"):
+    def __init__(self, source_id: str = "", embeddings_key: str = "embeddings"):
        self._filter = models.Filter(
            must=[
                models.FieldCondition(
-                    key="metadata.store",
-                    match=models.MatchValue(value=path.replace("application/indexes/", "").rstrip("/")),
+                    key="metadata.source_id",
+                    match=models.MatchValue(value=source_id.replace("application/indexes/", "").rstrip("/")),
                )
            ]
        )
--- a/application/worker.py
+++ b/application/worker.py
@@ -145,7 +145,6 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=
            "file_pkl": open(full_path + "/index.pkl", "rb"),
        }
        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
-        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
    else:
        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)

@@ -197,7 +196,6 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
        }

        requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
-        requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
    else:
        requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)