Mirror of https://github.com/arc53/DocsGPT.git (synced 2026-05-07 14:34:32 +00:00)
Compare commits: 1 commit (dependabot...auto-chunk)

| Author | SHA1 | Date |
|---|---|---|
|  | 01ea90f39a |  |
```diff
@@ -446,7 +446,8 @@ class Stream(Resource):
         attachment_ids = data.get("attachments", [])

         index = data.get("index", None)
-        chunks = int(data.get("chunks", 2))
+        chunks_from_request = data.get("chunks", 2)
+        chunks = chunks_from_request if str(chunks_from_request) == 'Auto' else int(chunks_from_request)
         token_limit = data.get("token_limit", settings.DEFAULT_MAX_HISTORY)
         retriever_name = data.get("retriever", "classic")
         agent_id = data.get("agent_id", None)
```
```diff
@@ -620,7 +621,8 @@ class Answer(Resource):
         )
         conversation_id = data.get("conversation_id")
         prompt_id = data.get("prompt_id", "default")
-        chunks = int(data.get("chunks", 2))
+        chunks_from_request = data.get("chunks", 2)
+        chunks = chunks_from_request if str(chunks_from_request) == 'Auto' else int(chunks_from_request)
         token_limit = data.get("token_limit", settings.DEFAULT_MAX_HISTORY)
         retriever_name = data.get("retriever", "classic")
         agent_type = settings.AGENT_NAME
```
```diff
@@ -814,7 +816,8 @@ class Search(Resource):

         try:
             question = data["question"]
-            chunks = int(data.get("chunks", 2))
+            chunks_from_request = data.get("chunks", 2)
+            chunks = chunks_from_request if str(chunks_from_request) == 'Auto' else int(chunks_from_request)
             token_limit = data.get("token_limit", settings.DEFAULT_MAX_HISTORY)
             retriever_name = data.get("retriever", "classic")

```
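All three handlers now share one parsing rule: the literal string 'Auto' passes through untouched so the retriever can switch modes, while every other value is coerced to an integer as before. A minimal standalone sketch of that rule (the helper name `parse_chunks` is ours, not the repo's):

```python
def parse_chunks(raw, default=2):
    """Keep the sentinel string 'Auto' as-is; coerce everything else to int."""
    value = default if raw is None else raw
    return value if str(value) == 'Auto' else int(value)

assert parse_chunks('Auto') == 'Auto'
assert parse_chunks('4') == 4      # form values often arrive as strings
assert parse_chunks(None) == 2     # falls back to the request default
```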
```diff
@@ -2,11 +2,16 @@ import logging
 from application.core.settings import settings
 from application.llm.llm_creator import LLMCreator
 from application.retriever.base import BaseRetriever

 from application.vectorstore.vector_creator import VectorCreator

+logger = logging.getLogger(__name__)

 class ClassicRAG(BaseRetriever):
+    # Settings for Auto-Chunking
+    AUTO_CHUNK_MIN: int = 0
+    AUTO_CHUNK_MAX: int = 10
+    SIMILARITY_SCORE_THRESHOLD: float = 0.5
+
     def __init__(
         self,
         source,
```
```diff
@@ -47,6 +52,7 @@ class ClassicRAG(BaseRetriever):
         self.question = self._rephrase_query()
         self.vectorstore = source["active_docs"] if "active_docs" in source else None
         self.decoded_token = decoded_token
+        self.actual_chunks_retrieved = 0

     def _rephrase_query(self):
         if (
```
```diff
@@ -77,8 +83,66 @@ class ClassicRAG(BaseRetriever):
         return self.original_question

     def _get_data(self):
+        if self.chunks == 'Auto':
+            return self._get_data_auto()
+        else:
+            return self._get_data_classic()
+
+    def _get_data_auto(self):
+        if not self.vectorstore:
+            self.actual_chunks_retrieved = 0
+            return []
+
+        docsearch = VectorCreator.create_vectorstore(
+            settings.VECTOR_STORE, self.vectorstore, settings.EMBEDDINGS_KEY
+        )
+
+        try:
+            docs_with_scores = docsearch.search_with_scores(self.question, k=self.AUTO_CHUNK_MAX)
+        except Exception as e:
+            logger.error(f"Error during search_with_scores: {e}", exc_info=True)
+            self.actual_chunks_retrieved = 0
+            return []
+
+        if not docs_with_scores:
+            self.actual_chunks_retrieved = 0
+            return []
+
+        candidate_docs = []
+        for doc, score in docs_with_scores:
+            if score >= self.SIMILARITY_SCORE_THRESHOLD:
+                candidate_docs.append(doc)
+
+        if len(candidate_docs) < self.AUTO_CHUNK_MIN and self.AUTO_CHUNK_MIN > 0:
+            final_docs_to_format = [doc for doc, score in docs_with_scores[:self.AUTO_CHUNK_MIN]]
+        else:
+            final_docs_to_format = candidate_docs
+
+        self.actual_chunks_retrieved = len(final_docs_to_format)
+
+        if not final_docs_to_format:
+            return []
+
+        formatted_docs = [
+            {
+                "title": i.metadata.get(
+                    "title", i.metadata.get("post_title", i.page_content)
+                ).split("/")[-1],
+                "text": i.page_content,
+                "source": (
+                    i.metadata.get("source")
+                    if i.metadata.get("source")
+                    else "local"
+                ),
+            }
+            for i in final_docs_to_format
+        ]
+        logger.info(f"AutoRAG: Retrieved {self.actual_chunks_retrieved} chunks for query '{self.original_question}'.")
+        return formatted_docs
+
+    def _get_data_classic(self):
         if self.chunks == 0:
-            docs = []
+            return []
         else:
             docsearch = VectorCreator.create_vectorstore(
                 settings.VECTOR_STORE, self.vectorstore, settings.EMBEDDINGS_KEY
```
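The selection policy in `_get_data_auto` reduces to: take the top `AUTO_CHUNK_MAX` hits, keep those at or above the similarity threshold, and backfill from the best hits when a non-zero `AUTO_CHUNK_MIN` is configured and too few pass. A condensed sketch of just that policy, assuming the hits arrive sorted best-first (as vector stores return them):

```python
def select_auto_chunks(docs_with_scores, threshold=0.5, min_k=0, max_k=10):
    """Condensed version of the selection logic in _get_data_auto (sketch)."""
    candidates = [doc for doc, score in docs_with_scores[:max_k] if score >= threshold]
    if min_k > 0 and len(candidates) < min_k:
        # Too few passed the threshold: fall back to the top min_k hits anyway.
        return [doc for doc, _ in docs_with_scores[:min_k]]
    return candidates

hits = [("a", 0.9), ("b", 0.6), ("c", 0.3)]
print(select_auto_chunks(hits))           # ['a', 'b']
print(select_auto_chunks(hits, min_k=3))  # ['a', 'b', 'c']
```

With the shipped defaults (`AUTO_CHUNK_MIN = 0`) the backfill branch is inert; it only activates if a deployment raises the minimum.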
```diff
@@ -98,8 +162,7 @@ class ClassicRAG(BaseRetriever):
                 }
                 for i in docs_temp
             ]
-
-        return docs
+            return docs

     def gen():
         pass
```
```diff
@@ -111,12 +174,24 @@ class ClassicRAG(BaseRetriever):
         return self._get_data()

     def get_params(self):
-        return {
+        params = {
             "question": self.original_question,
             "rephrased_question": self.question,
             "source": self.vectorstore,
-            "chunks": self.chunks,
             "token_limit": self.token_limit,
             "gpt_model": self.gpt_model,
             "user_api_key": self.user_api_key,
         }
+        if self.chunks == 'Auto':
+            params.update({
+                "chunks_mode": "Auto",
+                "chunks_retrieved_auto": self.actual_chunks_retrieved,
+                "auto_chunk_min_setting": self.AUTO_CHUNK_MIN,
+                "auto_chunk_max_setting": self.AUTO_CHUNK_MAX,
+                "similarity_threshold_setting": self.SIMILARITY_SCORE_THRESHOLD,
+            })
+        else:
+            params["chunks_mode"] = "Classic"
+            params["chunks"] = self.chunks
+
+        return params
```
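After this change `get_params()` labels every record with its retrieval mode. Roughly what the two shapes look like (values illustrative, not taken from a real run):

```python
# "Auto" mode reports what was actually retrieved plus the tuning knobs:
auto_mode = {"chunks_mode": "Auto", "chunks_retrieved_auto": 3,
             "auto_chunk_min_setting": 0, "auto_chunk_max_setting": 10,
             "similarity_threshold_setting": 0.5}

# Classic mode keeps the original fixed-count field:
classic_mode = {"chunks_mode": "Classic", "chunks": 2}
```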
```diff
@@ -2,7 +2,6 @@ from application.retriever.classic_rag import ClassicRAG
 from application.retriever.duckduck_search import DuckDuckSearch
 from application.retriever.brave_search import BraveRetSearch
-

 class RetrieverCreator:
     retrievers = {
         "classic": ClassicRAG,
```
```diff
@@ -58,6 +58,10 @@ class BaseVectorStore(ABC):
     def search(self, *args, **kwargs):
         pass

+    @abstractmethod
+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        pass
+
     def is_azure_configured(self):
         return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME

```
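The new abstract method pins down a single contract for every backend: given a query and `k`, return a list of `(Document, score)` pairs, best first, with higher scores meaning more similar; most of the stores below map their native metric into a 0-1 range to fit. A sketch of the contract in isolation (the class name here is ours, a stand-in for `BaseVectorStore`):

```python
from abc import ABC, abstractmethod

class ScoredSearchContract(ABC):  # illustrative stand-in for BaseVectorStore
    @abstractmethod
    def search_with_scores(self, query: str, k: int, *args, **kwargs):
        """Return [(doc, score), ...], best first; higher means more similar."""

# Subclasses that fail to implement search_with_scores can no longer be
# instantiated (TypeError), which is how the new requirement is enforced.
```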
```diff
@@ -109,6 +109,46 @@ class ElasticsearchStore(BaseVectorStore):
             doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata']))
         return doc_list

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
+        vector = embeddings.embed_query(query)
+        knn = {
+            "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
+            "field": "vector",
+            "k": k,
+            "num_candidates": 100,
+            "query_vector": vector,
+        }
+        full_query = {
+            "knn": knn,
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "match": {
+                                "text": {
+                                    "query": query,
+                                }
+                            }
+                        }
+                    ],
+                    "filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
+                }
+            },
+            "rank": {"rrf": {}},
+        }
+        resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn'])
+
+        docs_with_scores = []
+        for hit in resp['hits']['hits']:
+            score = hit['_score']
+            # Normalize the score. Elasticsearch returns a score of 1.0 + cosine similarity.
+            similarity = max(0, score - 1.0)
+            doc = Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
+            docs_with_scores.append((doc, similarity))
+
+        return docs_with_scores
+
     def _create_index_if_not_exists(
             self, index_name, dims_length
     ):
```
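The score handling mirrors the hunk's own comment: `_score` is assumed to be 1.0 plus the cosine similarity, so subtracting 1.0 recovers the similarity and the clamp floors negative cosines at zero. A tiny sketch of just that mapping:

```python
def es_similarity(score: float) -> float:
    # Assumes _score = 1.0 + cosine similarity, per the comment in the hunk.
    return max(0.0, score - 1.0)

print(es_similarity(1.8))  # 0.8 -> passes the 0.5 threshold
print(es_similarity(1.3))  # 0.3 -> filtered out
```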
```diff
@@ -63,6 +63,18 @@ class FaissStore(BaseVectorStore):
     def search(self, *args, **kwargs):
         return self.docsearch.similarity_search(*args, **kwargs)

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        docs_and_distances = self.docsearch.similarity_search_with_score(query, k, *args, **kwargs)
+
+        # Convert L2 distance to a normalized similarity score (0-1, higher is better)
+        docs_and_similarities = []
+        for doc, distance in docs_and_distances:
+            if distance < 0: distance = 0
+            similarity = 1 / (1 + distance)
+            docs_and_similarities.append((doc, similarity))
+
+        return docs_and_similarities
+
     def add_texts(self, *args, **kwargs):
         return self.docsearch.add_texts(*args, **kwargs)
```
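FAISS reports raw L2 distances, so the hunk maps them through 1 / (1 + d): distance 0 becomes similarity 1.0 and the score decays monotonically toward 0. One practical consequence: against the retriever's 0.5 threshold, a FAISS chunk survives only if its L2 distance is at most 1.0. A quick check of the mapping:

```python
for d in [0.0, 0.5, 1.0, 2.0, 9.0]:
    print(f"distance={d}  similarity={1 / (1 + d):.3f}")
# distance=0.0  similarity=1.000
# distance=0.5  similarity=0.667
# distance=1.0  similarity=0.500  <- exactly at the 0.5 threshold
# distance=2.0  similarity=0.333
# distance=9.0  similarity=0.100
```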
```diff
@@ -2,6 +2,8 @@ from typing import List, Optional
 import importlib
 from application.vectorstore.base import BaseVectorStore
 from application.core.settings import settings
+from application.vectorstore.document_class import Document
+

 class LanceDBVectorStore(BaseVectorStore):
     """Class for LanceDB Vector Store integration."""
```
```diff
@@ -87,6 +89,23 @@ class LanceDBVectorStore(BaseVectorStore):
         results = self.docsearch.search(query_embedding).limit(k).to_list()
         return [(result["_distance"], result["text"], result["metadata"]) for result in results]

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        """Perform a similarity search with scores."""
+        self.ensure_table_exists()
+        query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
+        results = self.docsearch.search(query_embedding).limit(k).to_list()
+
+        docs_with_scores = []
+        for result in results:
+            distance = result.get('_distance', float('inf'))
+            if distance < 0: distance = 0
+            # Convert L2 distance to a normalized similarity score
+            similarity = 1 / (1 + distance)
+            doc = Document(page_content=result['text'], metadata=result["metadata"])
+            docs_with_scores.append((doc, similarity))
+
+        return docs_with_scores
+
     def delete_index(self):
         """Delete the entire LanceDB index (table)."""
         if self.table:
```
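LanceDB reuses the same 1 / (1 + d) mapping, with one wrinkle worth noting: a result row missing `_distance` defaults to infinity, which the mapping sends to exactly 0.0, so such rows can never clear the similarity threshold:

```python
distance = float('inf')    # default when a result row lacks '_distance'
print(1 / (1 + distance))  # 0.0 -> always filtered out by the threshold
```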
```diff
@@ -26,6 +26,16 @@ class MilvusStore(BaseVectorStore):
         expr = f"source_id == '{self._source_id}'"
         return self._docsearch.similarity_search(query=question, k=k, expr=expr, *args, **kwargs)

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        expr = f"source_id == '{self._source_id}'"
+        docs_and_distances = self._docsearch.similarity_search_with_score(query, k, expr=expr, *args, **kwargs)
+        docs_with_scores = []
+        for doc, distance in docs_and_distances:
+            similarity = 1.0 - distance
+            docs_with_scores.append((doc, max(0, similarity)))
+
+        return docs_with_scores
+
     def add_texts(self, texts: List[str], metadatas: Optional[List[dict]], *args, **kwargs):
         ids = [str(uuid4()) for _ in range(len(texts))]
```
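Milvus takes a different mapping, 1.0 - distance with a clamp at zero. That only yields a meaningful 0-1 similarity when the collection's metric is itself bounded (for example cosine distance); with raw L2 distances above 1.0, everything clamps to 0 and could never clear the threshold:

```python
for d in [0.1, 0.5, 1.2]:
    print(f"distance={d}  similarity={max(0, 1.0 - d)}")
# distance=0.1  similarity=0.9
# distance=0.5  similarity=0.5
# distance=1.2  similarity=0    <- clamped; unreachable by the 0.5 threshold
```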
```diff
@@ -63,6 +63,40 @@ class MongoDBVectorStore(BaseVectorStore):
             results.append(Document(text, metadata))
         return results

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        query_vector = self._embedding.embed_query(query)
+
+        pipeline = [
+            {
+                "$vectorSearch": {
+                    "queryVector": query_vector,
+                    "path": self._embedding_key,
+                    "limit": k,
+                    "numCandidates": k * 10,
+                    "index": self._index_name,
+                    "filter": {"source_id": {"$eq": self._source_id}},
+                }
+            },
+            {
+                "$addFields": {
+                    "score": {"$meta": "vectorSearchScore"}
+                }
+            }
+        ]
+
+        cursor = self._collection.aggregate(pipeline)
+
+        results = []
+        for doc in cursor:
+            score = doc.pop("score", 0.0)
+            text = doc.pop(self._text_key)
+            doc.pop("_id")
+            doc.pop(self._embedding_key, None)
+            metadata = doc
+            doc = Document(page_content=text, metadata=metadata)
+            results.append((doc, score))
+        return results
+
     def _insert_texts(self, texts, metadatas):
         if not texts:
             return []
```
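The MongoDB path reads Atlas's `vectorSearchScore` meta field directly and then strips the bookkeeping fields off each document so the remainder can serve as metadata. A stand-in showing that unpacking on a fabricated result row (field names follow the hunk; the sample values are ours):

```python
row = {"_id": 1, "text": "hello world", "embedding": [0.1, 0.2],
       "source_id": "s1", "score": 0.93}

score = row.pop("score", 0.0)  # similarity from $meta: vectorSearchScore
text = row.pop("text")         # self._text_key in the hunk
row.pop("_id")
row.pop("embedding", None)     # self._embedding_key in the hunk
metadata = row                 # whatever is left becomes metadata

print(text, score, metadata)   # hello world 0.93 {'source_id': 's1'}
```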
```diff
@@ -36,6 +36,9 @@ class QdrantStore(BaseVectorStore):
     def search(self, *args, **kwargs):
         return self._docsearch.similarity_search(filter=self._filter, *args, **kwargs)

+    def search_with_scores(self, query: str, k: int, *args, **kwargs):
+        return self._docsearch.similarity_search_with_score(query=query, k=k, filter=self._filter, *args, **kwargs)
+
     def add_texts(self, *args, **kwargs):
         return self._docsearch.add_texts(*args, **kwargs)
```
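Qdrant is the simplest case: LangChain's `similarity_search_with_score` already returns `(Document, score)` pairs in the shape the retriever expects, so the hunk forwards the call with the source filter and applies no remapping. Whether the raw Qdrant score lands on the same 0-1 scale as the other stores depends on the collection's distance metric; the diff leaves it untouched.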
frontend/package-lock.json (generated, 3224 lines changed): file diff suppressed because it is too large.
```diff
@@ -36,7 +36,7 @@ export default function General() {
     { label: '繁體中文(臺灣)', value: 'zhTW' },
     { label: 'Русский', value: 'ru' },
   ];
-  const chunks = ['0', '2', '4', '6', '8', '10'];
+  const chunks = ['Auto', '0', '2', '4', '6', '8', '10'];
   const token_limits = new Map([
     [0, t('settings.general.none')],
     [100, t('settings.general.low')],
```
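On the frontend, the new 'Auto' entry is just another string option in the chunks dropdown; it travels to the backend as-is, where the `str(chunks_from_request) == 'Auto'` checks in the route handlers above pick it up and route retrieval through `_get_data_auto`.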