feat: migrate store to source_id

This commit is contained in:
Alex
2024-09-09 15:46:18 +01:00
parent 1bb81614a5
commit 2f9c72c1cf
10 changed files with 43 additions and 40 deletions

View File

@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
sources_collection = db["sources"]
prompts_collection = db["prompts"]
api_key_collection = db["api_keys"]
answer = Blueprint("answer", __name__)
@@ -91,7 +91,7 @@ def get_data_from_api_key(api_key):
def get_retriever(source_id: str):
doc = vectors_collection.find_one({"_id": ObjectId(source_id)})
doc = sources_collection.find_one({"_id": ObjectId(source_id)})
if doc is None:
raise Exception("Source document does not exist", 404)
retriever_name = None if "retriever" not in doc else doc["retriever"]

View File

@@ -9,7 +9,7 @@ from application.core.settings import settings
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
sources_collection = db["sources"]
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -60,8 +60,8 @@ def upload_index_files():
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))
file_pkl.save(os.path.join(save_dir, "index.pkl"))
# create entry in vectors_collection
vectors_collection.insert_one(
# create entry in sources_collection
sources_collection.insert_one(
{
"_id": ObjectId(id),
"user": user,

View File

@@ -17,7 +17,7 @@ from application.vectorstore.vector_creator import VectorCreator
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
sources_collection = db["sources"]
prompts_collection = db["prompts"]
feedback_collection = db["feedback"]
api_key_collection = db["api_keys"]
@@ -106,7 +106,7 @@ def delete_by_ids():
return {"status": "error"}
if settings.VECTOR_STORE == "faiss":
result = vectors_collection.delete_index(ids=ids)
result = sources_collection.delete_index(ids=ids)
if result:
return {"status": "ok"}
return {"status": "error"}
@@ -116,9 +116,9 @@ def delete_by_ids():
def delete_old():
"""Delete old indexes."""
import shutil
path = request.args.get("path")
doc = vectors_collection.find_one({
"_id": ObjectId(path),
source_id = request.args.get("source_id")
doc = sources_collection.find_one({
"_id": ObjectId(source_id),
"user": "local",
})
if(doc is None):
@@ -129,10 +129,10 @@ def delete_old():
except FileNotFoundError:
pass
else:
vetorstore = VectorCreator.create_vectorstore(settings.VECTOR_STORE, path=str(doc["_id"]))
vetorstore = VectorCreator.create_vectorstore(settings.VECTOR_STORE, source_id=str(doc["_id"]))
vetorstore.delete_index()
vectors_collection.delete_one({
"_id": ObjectId(path),
sources_collection.delete_one({
"_id": ObjectId(source_id),
})
return {"status": "ok"}
@@ -244,8 +244,8 @@ def combined_json():
}
]
# structure: name, language, version, description, fullName, date, docLink
# append data from vectors_collection in sorted order in descending order of date
for index in vectors_collection.find({"user": user}).sort("date", -1):
# append data from sources_collection in sorted order in descending order of date
for index in sources_collection.find({"user": user}).sort("date", -1):
data.append(
{
"id": str(index["_id"]),

View File

@@ -12,8 +12,8 @@ from retry import retry
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i, id):
# add store to the metadata
i.metadata["store"] = str(id)
# add source_id to the metadata
i.metadata["source_id"] = str(id)
store.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])
@@ -34,13 +34,13 @@ def call_openai_api(docs, folder_name, id, task_status):
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
docs_init=docs_init,
path=f"{folder_name}",
source_id=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY"),
)
else:
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
path=str(id),
source_id=str(id),
embeddings_key=os.getenv("EMBEDDINGS_KEY"),
)
# Uncomment for MPNet embeddings

View File

@@ -9,9 +9,9 @@ import elasticsearch
class ElasticsearchStore(BaseVectorStore):
_es_connection = None # Class attribute to hold the Elasticsearch connection
def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
def __init__(self, source_id, embeddings_key, index_name=settings.ELASTIC_INDEX):
super().__init__()
self.path = path.replace("application/indexes/", "").rstrip("/")
self.source_id = source_id.replace("application/indexes/", "").rstrip("/")
self.embeddings_key = embeddings_key
self.index_name = index_name
@@ -81,7 +81,7 @@ class ElasticsearchStore(BaseVectorStore):
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
vector = embeddings.embed_query(question)
knn = {
"filter": [{"match": {"metadata.store.keyword": self.path}}],
"filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
"field": "vector",
"k": k,
"num_candidates": 100,
@@ -100,7 +100,7 @@ class ElasticsearchStore(BaseVectorStore):
}
}
],
"filter": [{"match": {"metadata.store.keyword": self.path}}],
"filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
}
},
"rank": {"rrf": {}},
@@ -209,4 +209,4 @@ class ElasticsearchStore(BaseVectorStore):
def delete_index(self):
self._es_connection.delete_by_query(index=self.index_name, query={"match": {
"metadata.store.keyword": self.path}},)
"metadata.source_id.keyword": self.source_id}},)

View File

@@ -5,7 +5,7 @@ from application.vectorstore.document_class import Document
class MongoDBVectorStore(BaseVectorStore):
def __init__(
self,
path: str = "",
source_id: str = "",
embeddings_key: str = "embeddings",
collection: str = "documents",
index_name: str = "vector_search_index",
@@ -18,7 +18,7 @@ class MongoDBVectorStore(BaseVectorStore):
self._embedding_key = embedding_key
self._embeddings_key = embeddings_key
self._mongo_uri = settings.MONGO_URI
self._path = path.replace("application/indexes/", "").rstrip("/")
self._source_id = source_id.replace("application/indexes/", "").rstrip("/")
self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
try:
@@ -46,7 +46,7 @@ class MongoDBVectorStore(BaseVectorStore):
"numCandidates": k * 10,
"index": self._index_name,
"filter": {
"store": {"$eq": self._path}
"source_id": {"$eq": self._source_id}
}
}
}
@@ -123,4 +123,4 @@ class MongoDBVectorStore(BaseVectorStore):
return result_ids
def delete_index(self, *args, **kwargs):
self._collection.delete_many({"store": self._path})
self._collection.delete_many({"source_id": self._source_id})

View File

@@ -5,12 +5,12 @@ from qdrant_client import models
class QdrantStore(BaseVectorStore):
def __init__(self, path: str = "", embeddings_key: str = "embeddings"):
def __init__(self, source_id: str = "", embeddings_key: str = "embeddings"):
self._filter = models.Filter(
must=[
models.FieldCondition(
key="metadata.store",
match=models.MatchValue(value=path.replace("application/indexes/", "").rstrip("/")),
key="metadata.source_id",
match=models.MatchValue(value=source_id.replace("application/indexes/", "").rstrip("/")),
)
]
)

View File

@@ -145,7 +145,6 @@ def ingest_worker(self, directory, formats, name_job, filename, user, retriever=
"file_pkl": open(full_path + "/index.pkl", "rb"),
}
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
else:
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
@@ -197,7 +196,6 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp", r
}
requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
requests.get(urljoin(settings.API_URL, "/api/delete_old?name=" + name_job + "&?user=" + user))
else:
requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)