mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 00:23:17 +00:00
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from flask import make_response, request, Response
|
||||
from flask import request, Response
|
||||
from flask_restx import fields, Resource
|
||||
|
||||
from application.api import api
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import logging
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from application.core.settings import settings
|
||||
from application.vectorstore.document_class import Document
|
||||
|
||||
|
||||
class QdrantStore(BaseVectorStore):
|
||||
@@ -7,18 +9,22 @@ class QdrantStore(BaseVectorStore):
|
||||
from qdrant_client import models
|
||||
from langchain_community.vectorstores.qdrant import Qdrant
|
||||
|
||||
# Store the source_id for use in add_chunk
|
||||
self._source_id = str(source_id).replace("application/indexes/", "").rstrip("/")
|
||||
|
||||
self._filter = models.Filter(
|
||||
must=[
|
||||
models.FieldCondition(
|
||||
key="metadata.source_id",
|
||||
match=models.MatchValue(value=source_id.replace("application/indexes/", "").rstrip("/")),
|
||||
match=models.MatchValue(value=self._source_id),
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
embedding=self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
|
||||
self._docsearch = Qdrant.construct_instance(
|
||||
["TEXT_TO_OBTAIN_EMBEDDINGS_DIMENSION"],
|
||||
embedding=self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key),
|
||||
embedding=embedding,
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME,
|
||||
location=settings.QDRANT_LOCATION,
|
||||
url=settings.QDRANT_URL,
|
||||
@@ -32,6 +38,32 @@ class QdrantStore(BaseVectorStore):
|
||||
path=settings.QDRANT_PATH,
|
||||
distance_func=settings.QDRANT_DISTANCE_FUNC,
|
||||
)
|
||||
try:
|
||||
collections = self._docsearch.client.get_collections()
|
||||
collection_exists = settings.QDRANT_COLLECTION_NAME in [
|
||||
collection.name for collection in collections.collections
|
||||
]
|
||||
|
||||
if not collection_exists:
|
||||
self._docsearch.client.recreate_collection(
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME,
|
||||
vectors_config=models.VectorParams(size=embedding.client[1].word_embedding_dimension, distance=models.Distance.COSINE),
|
||||
)
|
||||
|
||||
# Ensure the required index exists for metadata.source_id
|
||||
try:
|
||||
self._docsearch.client.create_payload_index(
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME,
|
||||
field_name="metadata.source_id",
|
||||
field_schema=models.PayloadSchemaType.KEYWORD,
|
||||
)
|
||||
except Exception as index_error:
|
||||
# Index might already exist, which is fine
|
||||
if "already exists" not in str(index_error).lower():
|
||||
logging.warning(f"Could not create index for metadata.source_id: {index_error}")
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"Could not check for collection: {e}")
|
||||
|
||||
def search(self, *args, **kwargs):
|
||||
return self._docsearch.similarity_search(filter=self._filter, *args, **kwargs)
|
||||
@@ -46,3 +78,59 @@ class QdrantStore(BaseVectorStore):
|
||||
return self._docsearch.client.delete(
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME, points_selector=self._filter
|
||||
)
|
||||
|
||||
def get_chunks(self):
|
||||
try:
|
||||
|
||||
chunks = []
|
||||
offset = None
|
||||
while True:
|
||||
records, offset = self._docsearch.client.scroll(
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME,
|
||||
scroll_filter=self._filter,
|
||||
limit=10,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
offset=offset,
|
||||
)
|
||||
for record in records:
|
||||
doc_id = record.id
|
||||
text = record.payload.get("page_content")
|
||||
metadata = record.payload.get("metadata")
|
||||
chunks.append(
|
||||
{"doc_id": doc_id, "text": text, "metadata": metadata}
|
||||
)
|
||||
if offset is None:
|
||||
break
|
||||
return chunks
|
||||
except Exception as e:
|
||||
logging.error(f"Error getting chunks: {e}", exc_info=True)
|
||||
return []
|
||||
|
||||
def add_chunk(self, text, metadata=None):
|
||||
import uuid
|
||||
metadata = metadata or {}
|
||||
|
||||
# Create a copy to avoid modifying the original metadata
|
||||
final_metadata = metadata.copy()
|
||||
|
||||
# Ensure the source_id is in the metadata so the chunk can be found by filters
|
||||
final_metadata["source_id"] = self._source_id
|
||||
|
||||
doc = Document(page_content=text, metadata=final_metadata)
|
||||
# Generate a unique ID for the document
|
||||
doc_id = str(uuid.uuid4())
|
||||
doc.id = doc_id
|
||||
doc_ids = self._docsearch.add_documents([doc])
|
||||
return doc_ids[0] if doc_ids else doc_id
|
||||
|
||||
def delete_chunk(self, chunk_id):
|
||||
try:
|
||||
self._docsearch.client.delete(
|
||||
collection_name=settings.QDRANT_COLLECTION_NAME,
|
||||
points_selector=[chunk_id],
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.error(f"Error deleting chunk: {e}", exc_info=True)
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user