mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 00:23:17 +00:00
* feat: template-based prompt rendering with dynamic namespace injection * refactor: improve template engine initialization with clearer formatting * refactor: streamline ReActAgent methods and improve content extraction logic feat: enhance error handling in NamespaceManager and TemplateEngine fix: update NewAgent component to ensure consistent form data submission test: modify tests for ReActAgent and prompt renderer to reflect method changes and improve coverage * feat: tools namespace + three-tier token budget * refactor: remove unused variable assignment in message building tests * Enhance prompt customization and tool pre-fetching functionality * ruff lint fix * refactor: cleaner error handling and reduce code clutter --------- Co-authored-by: Alex <a@tushynski.me>
201 lines
7.8 KiB
Python
201 lines
7.8 KiB
Python
import logging
|
|
import os
|
|
|
|
from application.core.settings import settings
|
|
from application.llm.llm_creator import LLMCreator
|
|
from application.retriever.base import BaseRetriever
|
|
from application.utils import num_tokens_from_string
|
|
from application.vectorstore.vector_creator import VectorCreator
|
|
|
|
|
|
class ClassicRAG(BaseRetriever):
|
|
def __init__(
|
|
self,
|
|
source,
|
|
chat_history=None,
|
|
prompt="",
|
|
chunks=2,
|
|
doc_token_limit=50000,
|
|
gpt_model="docsgpt",
|
|
user_api_key=None,
|
|
llm_name=settings.LLM_PROVIDER,
|
|
api_key=settings.API_KEY,
|
|
decoded_token=None,
|
|
):
|
|
self.original_question = source.get("question", "")
|
|
self.chat_history = chat_history if chat_history is not None else []
|
|
self.prompt = prompt
|
|
if isinstance(chunks, str):
|
|
try:
|
|
self.chunks = int(chunks)
|
|
except ValueError:
|
|
logging.warning(
|
|
f"Invalid chunks value '{chunks}', using default value 2"
|
|
)
|
|
self.chunks = 2
|
|
else:
|
|
self.chunks = chunks
|
|
user_identifier = user_api_key if user_api_key else "default"
|
|
logging.info(
|
|
f"ClassicRAG initialized with chunks={self.chunks}, user_api_key={user_identifier}, "
|
|
f"sources={'active_docs' in source and source['active_docs'] is not None}"
|
|
)
|
|
self.gpt_model = gpt_model
|
|
self.doc_token_limit = doc_token_limit
|
|
self.user_api_key = user_api_key
|
|
self.llm_name = llm_name
|
|
self.api_key = api_key
|
|
self.llm = LLMCreator.create_llm(
|
|
self.llm_name,
|
|
api_key=self.api_key,
|
|
user_api_key=self.user_api_key,
|
|
decoded_token=decoded_token,
|
|
)
|
|
|
|
if "active_docs" in source and source["active_docs"] is not None:
|
|
if isinstance(source["active_docs"], list):
|
|
self.vectorstores = source["active_docs"]
|
|
else:
|
|
self.vectorstores = [source["active_docs"]]
|
|
else:
|
|
self.vectorstores = []
|
|
self.question = self._rephrase_query()
|
|
self.decoded_token = decoded_token
|
|
self._validate_vectorstore_config()
|
|
|
|
def _validate_vectorstore_config(self):
|
|
"""Validate vectorstore IDs and remove any empty/invalid entries"""
|
|
if not self.vectorstores:
|
|
logging.warning("No vectorstores configured for retrieval")
|
|
return
|
|
invalid_ids = [
|
|
vs_id for vs_id in self.vectorstores if not vs_id or not vs_id.strip()
|
|
]
|
|
if invalid_ids:
|
|
logging.warning(f"Found invalid vectorstore IDs: {invalid_ids}")
|
|
self.vectorstores = [
|
|
vs_id for vs_id in self.vectorstores if vs_id and vs_id.strip()
|
|
]
|
|
|
|
def _rephrase_query(self):
|
|
"""Rephrase user query with chat history context for better retrieval"""
|
|
if (
|
|
not self.original_question
|
|
or not self.chat_history
|
|
or self.chat_history == []
|
|
or self.chunks == 0
|
|
or not self.vectorstores
|
|
):
|
|
return self.original_question
|
|
prompt = (
|
|
"Given the following conversation history:\n"
|
|
f"{self.chat_history}\n\n"
|
|
"Rephrase the following user question to be a standalone search query "
|
|
"that captures all relevant context from the conversation:\n"
|
|
)
|
|
|
|
messages = [
|
|
{"role": "system", "content": prompt},
|
|
{"role": "user", "content": self.original_question},
|
|
]
|
|
|
|
try:
|
|
rephrased_query = self.llm.gen(model=self.gpt_model, messages=messages)
|
|
print(f"Rephrased query: {rephrased_query}")
|
|
return rephrased_query if rephrased_query else self.original_question
|
|
except Exception as e:
|
|
logging.error(f"Error rephrasing query: {e}", exc_info=True)
|
|
return self.original_question
|
|
|
|
def _get_data(self):
|
|
if self.chunks == 0 or not self.vectorstores:
|
|
logging.info(
|
|
f"ClassicRAG._get_data: Skipping retrieval - chunks={self.chunks}, "
|
|
f"vectorstores_count={len(self.vectorstores) if self.vectorstores else 0}"
|
|
)
|
|
return []
|
|
|
|
all_docs = []
|
|
chunks_per_source = max(1, self.chunks // len(self.vectorstores))
|
|
token_budget = max(int(self.doc_token_limit * 0.9), 100)
|
|
cumulative_tokens = 0
|
|
|
|
for vectorstore_id in self.vectorstores:
|
|
if vectorstore_id:
|
|
try:
|
|
docsearch = VectorCreator.create_vectorstore(
|
|
settings.VECTOR_STORE, vectorstore_id, settings.EMBEDDINGS_KEY
|
|
)
|
|
docs_temp = docsearch.search(
|
|
self.question, k=max(chunks_per_source * 2, 20)
|
|
)
|
|
|
|
for doc in docs_temp:
|
|
if cumulative_tokens >= token_budget:
|
|
break
|
|
|
|
if hasattr(doc, "page_content") and hasattr(doc, "metadata"):
|
|
page_content = doc.page_content
|
|
metadata = doc.metadata
|
|
else:
|
|
page_content = doc.get("text", doc.get("page_content", ""))
|
|
metadata = doc.get("metadata", {})
|
|
|
|
title = metadata.get(
|
|
"title", metadata.get("post_title", page_content)
|
|
)
|
|
if not isinstance(title, str):
|
|
title = str(title)
|
|
title = title.split("/")[-1]
|
|
|
|
filename = (
|
|
metadata.get("filename")
|
|
or metadata.get("file_name")
|
|
or metadata.get("source")
|
|
)
|
|
if isinstance(filename, str):
|
|
filename = os.path.basename(filename) or filename
|
|
else:
|
|
filename = title
|
|
if not filename:
|
|
filename = title
|
|
source_path = metadata.get("source") or vectorstore_id
|
|
|
|
doc_text_with_header = f"{filename}\n{page_content}"
|
|
doc_tokens = num_tokens_from_string(doc_text_with_header)
|
|
|
|
if cumulative_tokens + doc_tokens < token_budget:
|
|
all_docs.append(
|
|
{
|
|
"title": title,
|
|
"text": page_content,
|
|
"source": source_path,
|
|
"filename": filename,
|
|
}
|
|
)
|
|
cumulative_tokens += doc_tokens
|
|
|
|
if cumulative_tokens >= token_budget:
|
|
break
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f"Error searching vectorstore {vectorstore_id}: {e}",
|
|
exc_info=True,
|
|
)
|
|
continue
|
|
|
|
logging.info(
|
|
f"ClassicRAG._get_data: Retrieval complete - retrieved {len(all_docs)} documents "
|
|
f"(requested chunks={self.chunks}, chunks_per_source={chunks_per_source}, "
|
|
f"cumulative_tokens={cumulative_tokens}/{token_budget})"
|
|
)
|
|
return all_docs
|
|
|
|
def search(self, query: str = ""):
|
|
"""Search for documents using optional query override"""
|
|
if query:
|
|
self.original_question = query
|
|
self.question = self._rephrase_query()
|
|
return self._get_data()
|