auto-rag
Needs vector store testing for all stores except FAISS.
@@ -58,6 +58,10 @@ class BaseVectorStore(ABC):

    def search(self, *args, **kwargs):
        pass

    @abstractmethod
    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        pass

    def is_azure_configured(self):
        return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME
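For context, here is a minimal sketch of what the new abstract contract asks of subclasses: each store must expose search_with_scores(query, k) returning (document, score) pairs. The classes below are hypothetical stand-ins, not the real BaseVectorStore.

# Hypothetical, simplified mirror of the contract added above; names are
# illustrative and do not come from application.vectorstore.base.
from abc import ABC, abstractmethod
from typing import Any, List, Tuple

class MiniVectorStore(ABC):
    @abstractmethod
    def search_with_scores(self, query: str, k: int, *args, **kwargs) -> List[Tuple[Any, float]]:
        ...

class InMemoryStore(MiniVectorStore):
    def __init__(self, docs):
        self.docs = docs  # list of (text, score) pairs, just for the sketch

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        # A real store would embed the query and rank by similarity;
        # here we simply return the first k stored pairs.
        return self.docs[:k]

print(InMemoryStore([("hello", 0.9), ("world", 0.4)]).search_with_scores("hi", 1))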
@@ -108,6 +108,46 @@ class ElasticsearchStore(BaseVectorStore):

            doc_list.append(Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata']))
        return doc_list

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vector = embeddings.embed_query(query)
        knn = {
            "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
            "field": "vector",
            "k": k,
            "num_candidates": 100,
            "query_vector": vector,
        }
        full_query = {
            "knn": knn,
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "text": {
                                    "query": query,
                                }
                            }
                        }
                    ],
                    "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
                }
            },
            "rank": {"rrf": {}},
        }
        resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn'])

        docs_with_scores = []
        for hit in resp['hits']['hits']:
            score = hit['_score']
            # Normalize the score. Elasticsearch returns a score of 1.0 + cosine similarity.
            similarity = max(0, score - 1.0)
            doc = Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
            docs_with_scores.append((doc, similarity))

        return docs_with_scores

    def _create_index_if_not_exists(
        self, index_name, dims_length
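As a side note, the normalization above relies on the behaviour stated in the comment: Elasticsearch reports _score as 1.0 + cosine similarity, so subtracting 1.0 recovers the cosine value and max(0, ...) clamps negative similarities. A standalone sketch of that mapping (the sample scores are made up):

# Illustrative only: sample _score values, not real Elasticsearch output.
def normalize_es_score(score: float) -> float:
    # _score = 1.0 + cosine similarity, so shift back and clamp at 0.
    return max(0.0, score - 1.0)

for raw in (1.92, 1.0, 0.4):
    print(raw, "->", round(normalize_es_score(raw), 2))
# 1.92 -> 0.92, 1.0 -> 0.0, 0.4 -> 0.0 (negative cosine clamped)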
@@ -62,6 +62,18 @@ class FaissStore(BaseVectorStore):

    def search(self, *args, **kwargs):
        return self.docsearch.similarity_search(*args, **kwargs)

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        docs_and_distances = self.docsearch.similarity_search_with_score(query, k, *args, **kwargs)

        # Convert L2 distance to a normalized similarity score (0-1, higher is better)
        docs_and_similarities = []
        for doc, distance in docs_and_distances:
            if distance < 0:
                distance = 0
            similarity = 1 / (1 + distance)
            docs_and_similarities.append((doc, similarity))

        return docs_and_similarities

    def add_texts(self, *args, **kwargs):
        return self.docsearch.add_texts(*args, **kwargs)
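FAISS returns raw L2 distances, so the conversion above maps a distance d to 1 / (1 + d): identical vectors (d = 0) score 1.0 and the score decays toward 0 as distance grows. A small sketch of that mapping (the distances are made-up examples):

# Made-up distances, purely to illustrate the 1 / (1 + d) conversion used above.
def l2_to_similarity(distance: float) -> float:
    distance = max(0.0, distance)  # guard against tiny negative values
    return 1.0 / (1.0 + distance)

for d in (0.0, 0.5, 4.0):
    print(d, "->", round(l2_to_similarity(d), 3))
# 0.0 -> 1.0, 0.5 -> 0.667, 4.0 -> 0.2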
@@ -2,6 +2,8 @@ from typing import List, Optional

import importlib
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
from application.vectorstore.document_class import Document


class LanceDBVectorStore(BaseVectorStore):
    """Class for LanceDB Vector Store integration."""

@@ -87,6 +89,23 @@ class LanceDBVectorStore(BaseVectorStore):

        results = self.docsearch.search(query_embedding).limit(k).to_list()
        return [(result["_distance"], result["text"], result["metadata"]) for result in results]

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        """Perform a similarity search with scores."""
        self.ensure_table_exists()
        query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
        results = self.docsearch.search(query_embedding).limit(k).to_list()

        docs_with_scores = []
        for result in results:
            distance = result.get('_distance', float('inf'))
            if distance < 0:
                distance = 0
            # Convert L2 distance to a normalized similarity score
            similarity = 1 / (1 + distance)
            doc = Document(page_content=result['text'], metadata=result["metadata"])
            docs_with_scores.append((doc, similarity))

        return docs_with_scores

    def delete_index(self):
        """Delete the entire LanceDB index (table)."""
        if self.table:
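One detail worth noting: result.get('_distance', float('inf')) means a row without a distance field falls through to similarity 0.0, since 1 / (1 + inf) evaluates to 0.0 in Python. A tiny sketch of that fallback (the result rows are fabricated):

# Fabricated LanceDB-style result rows, just to show the fallback behaviour.
rows = [
    {"_distance": 0.25, "text": "close match"},
    {"text": "row missing its _distance field"},
]

for row in rows:
    distance = max(0.0, row.get("_distance", float("inf")))
    similarity = 1 / (1 + distance)  # missing distance -> 1 / inf -> 0.0
    print(row["text"], round(similarity, 3))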
@@ -25,6 +25,16 @@ class MilvusStore(BaseVectorStore):

    def search(self, question, k=2, *args, **kwargs):
        expr = f"source_id == '{self._source_id}'"
        return self._docsearch.similarity_search(query=question, k=k, expr=expr, *args, **kwargs)

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        expr = f"source_id == '{self._source_id}'"
        docs_and_distances = self._docsearch.similarity_search_with_score(query, k, expr=expr, *args, **kwargs)
        docs_with_scores = []
        for doc, distance in docs_and_distances:
            similarity = 1.0 - distance
            docs_with_scores.append((doc, max(0, similarity)))

        return docs_with_scores

    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]], *args, **kwargs):
        ids = [str(uuid4()) for _ in range(len(texts))]
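The Milvus branch uses a different conversion, similarity = 1.0 - distance, which assumes distances that already sit roughly in the [0, 1] range; anything larger simply clamps to 0. A sketch of that assumption (sample distances invented):

# Invented distances; shows why max(0, 1.0 - distance) assumes a bounded metric.
def milvus_style_similarity(distance: float) -> float:
    return max(0.0, 1.0 - distance)

for d in (0.1, 0.9, 1.7):  # 1.7 would come from an unbounded metric such as L2
    print(d, "->", round(milvus_style_similarity(d), 2))
# 0.1 -> 0.9, 0.9 -> 0.1, 1.7 -> 0.0 (clamped)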
@@ -62,6 +62,40 @@ class MongoDBVectorStore(BaseVectorStore):

            metadata = doc
            results.append(Document(text, metadata))
        return results

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        query_vector = self._embedding.embed_query(query)

        pipeline = [
            {
                "$vectorSearch": {
                    "queryVector": query_vector,
                    "path": self._embedding_key,
                    "limit": k,
                    "numCandidates": k * 10,
                    "index": self._index_name,
                    "filter": {"source_id": {"$eq": self._source_id}},
                }
            },
            {
                "$addFields": {
                    "score": {"$meta": "vectorSearchScore"}
                }
            }
        ]

        cursor = self._collection.aggregate(pipeline)

        results = []
        for doc in cursor:
            score = doc.pop("score", 0.0)
            text = doc.pop(self._text_key)
            doc.pop("_id")
            doc.pop(self._embedding_key, None)
            metadata = doc
            doc = Document(page_content=text, metadata=metadata)
            results.append((doc, score))
        return results

    def _insert_texts(self, texts, metadatas):
        if not texts:
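After the $vectorSearch stage, each returned document carries its score plus the stored text, _id, and embedding, so the loop above pops those fields off and treats whatever remains as metadata. A standalone sketch of that unpacking on a fabricated document (field names here are placeholders, not DocsGPT's actual keys):

# Fabricated Atlas-style document; field names mimic the loop above.
doc = {
    "_id": "abc123",
    "text": "some chunk of text",
    "embedding": [0.1, 0.2, 0.3],
    "source_id": "source-1",
    "score": 0.87,
}

score = doc.pop("score", 0.0)
text = doc.pop("text")
doc.pop("_id")
doc.pop("embedding", None)
metadata = doc  # only metadata fields remain
print(score, text, metadata)  # 0.87 some chunk of text {'source_id': 'source-1'}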
@@ -35,6 +35,9 @@ class QdrantStore(BaseVectorStore):

    def search(self, *args, **kwargs):
        return self._docsearch.similarity_search(filter=self._filter, *args, **kwargs)

    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        return self._docsearch.similarity_search_with_score(query=query, k=k, filter=self._filter, *args, **kwargs)

    def add_texts(self, *args, **kwargs):
        return self._docsearch.add_texts(*args, **kwargs)
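Because every store now exposes the same search_with_scores signature, a caller can filter retrieved chunks by a similarity threshold without caring which backend is in use. A hedged sketch of such a caller, using a stub store rather than any real DocsGPT class (the threshold and result shape are illustrative):

# Stub store standing in for any of the vector stores above; the threshold
# value and result shape here are assumptions, not taken from DocsGPT.
from typing import List, Tuple

class StubStore:
    def search_with_scores(self, query: str, k: int) -> List[Tuple[str, float]]:
        return [("relevant chunk", 0.82), ("marginal chunk", 0.31)]

def retrieve(store, query: str, k: int = 4, min_score: float = 0.5):
    # Keep only results the store itself considers sufficiently similar.
    return [doc for doc, score in store.search_with_scores(query, k) if score >= min_score]

print(retrieve(StubStore(), "what is docsgpt?"))  # ['relevant chunk']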