# DocsGPT/application/retriever/classic_rag.py

import logging
import os

from application.core.settings import settings
from application.llm.llm_creator import LLMCreator
from application.retriever.base import BaseRetriever
from application.utils import num_tokens_from_string
from application.vectorstore.vector_creator import VectorCreator


class ClassicRAG(BaseRetriever):
    """Retriever that searches the configured vector stores for a question.

    On construction the question is optionally rephrased by the LLM using the
    chat history. ``search`` / ``_get_data`` then query every configured
    vector store and collect matching documents until a token budget derived
    from ``doc_token_limit`` is used up.
    """

    # Fallback used when ``chunks`` is missing or cannot be parsed.
    _DEFAULT_CHUNKS = 2

    def __init__(
        self,
        source,
        chat_history=None,
        prompt="",
        chunks=2,
        doc_token_limit=50000,
        gpt_model="docsgpt",
        user_api_key=None,
        llm_name=settings.LLM_PROVIDER,
        api_key=settings.API_KEY,
        decoded_token=None,
    ):
        """Set up the retriever and compute the (possibly rephrased) query.

        Args:
            source: Mapping read for ``"question"`` (the user question) and
                ``"active_docs"`` (a single vector store ID or a list of them).
            chat_history: Previous conversation turns; ``None`` means empty.
            prompt: Stored on the instance as ``self.prompt``.
            chunks: Requested number of chunks, as a number or numeric string.
                An unparsable string or ``None`` falls back to 2; ``0``
                disables retrieval.
            doc_token_limit: Token limit for retrieved documents; 90% of it
                (but at least 100) is used as the retrieval budget.
            gpt_model: Model name passed to ``llm.gen`` when rephrasing.
            user_api_key: Forwarded to ``LLMCreator.create_llm``.
            llm_name: LLM provider name passed to ``LLMCreator.create_llm``.
            api_key: Forwarded to ``LLMCreator.create_llm``.
            decoded_token: Forwarded to ``LLMCreator.create_llm`` and stored.
        """
        self.original_question = source.get("question", "")
        self.chat_history = chat_history if chat_history is not None else []
        self.prompt = prompt
        self.chunks = self._coerce_chunks(chunks)

        active_docs = source.get("active_docs")
        # Never write the raw key to the logs; a short masked prefix is enough
        # to tell keys apart while debugging.
        user_identifier = (
            self._mask_key(user_api_key) if user_api_key else "default"
        )
        logging.info(
            f"ClassicRAG initialized with chunks={self.chunks}, user_api_key={user_identifier}, "
            f"sources={active_docs is not None}"
        )
        self.gpt_model = gpt_model
        self.doc_token_limit = doc_token_limit
        self.user_api_key = user_api_key
        self.llm_name = llm_name
        self.api_key = api_key
        self.decoded_token = decoded_token
        self.llm = LLMCreator.create_llm(
            self.llm_name,
            api_key=self.api_key,
            user_api_key=self.user_api_key,
            decoded_token=decoded_token,
        )

        if active_docs is None:
            self.vectorstores = []
        elif isinstance(active_docs, list):
            self.vectorstores = active_docs
        else:
            self.vectorstores = [active_docs]

        # Validate before rephrasing: _rephrase_query skips the LLM call when
        # there are no vector stores, so blank IDs must be dropped first.
        self._validate_vectorstore_config()
        self.question = self._rephrase_query()

    @classmethod
    def _coerce_chunks(cls, chunks):
        """Normalize ``chunks``: parse strings, default on ``None``/bad input."""
        if chunks is None:
            # None would otherwise fail later in the ``chunks // n`` division.
            return cls._DEFAULT_CHUNKS
        if isinstance(chunks, str):
            try:
                return int(chunks)
            except ValueError:
                logging.warning(
                    f"Invalid chunks value '{chunks}', using default value 2"
                )
                return cls._DEFAULT_CHUNKS
        return chunks

    @staticmethod
    def _mask_key(key):
        """Return a log-safe form of a secret: a 4-char prefix plus ``***``."""
        text = str(key)
        if len(text) <= 8:
            # Too short to reveal any part of it safely.
            return "***"
        return f"{text[:4]}***"

    @staticmethod
    def _is_valid_vectorstore_id(vs_id):
        """Return True unless ``vs_id`` is falsy or a whitespace-only string."""
        if not vs_id:
            return False
        if isinstance(vs_id, str):
            return bool(vs_id.strip())
        # Non-string IDs have no ``strip``; any truthy value is kept.
        return True

    def _validate_vectorstore_config(self):
        """Validate vectorstore IDs and remove any empty/invalid entries"""
        if not self.vectorstores:
            logging.warning("No vectorstores configured for retrieval")
            return
        invalid_ids = [
            vs_id
            for vs_id in self.vectorstores
            if not self._is_valid_vectorstore_id(vs_id)
        ]
        if invalid_ids:
            logging.warning(f"Found invalid vectorstore IDs: {invalid_ids}")
            self.vectorstores = [
                vs_id
                for vs_id in self.vectorstores
                if self._is_valid_vectorstore_id(vs_id)
            ]

    def _rephrase_query(self):
        """Rephrase user query with chat history context for better retrieval

        Returns:
            The LLM's rephrased query, or the original question when there is
            nothing to rephrase (no question, no history, retrieval disabled,
            no vector stores), when the LLM returns an empty result, or when
            the LLM call raises.
        """
        if (
            not self.original_question
            or not self.chat_history
            or self.chunks == 0
            or not self.vectorstores
        ):
            return self.original_question
        prompt = (
            "Given the following conversation history:\n"
            f"{self.chat_history}\n\n"
            "Rephrase the following user question to be a standalone search query "
            "that captures all relevant context from the conversation:\n"
        )

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": self.original_question},
        ]

        try:
            rephrased_query = self.llm.gen(model=self.gpt_model, messages=messages)
        except Exception as e:
            # Best effort: a failed rephrase must not prevent retrieval.
            logging.error(f"Error rephrasing query: {e}", exc_info=True)
            return self.original_question
        logging.debug(f"Rephrased query: {rephrased_query}")
        return rephrased_query if rephrased_query else self.original_question

    @staticmethod
    def _build_doc_entry(doc, vectorstore_id):
        """Convert one search hit into the dict returned by ``_get_data``.

        ``doc`` is either an object exposing ``page_content`` and ``metadata``
        attributes or a mapping with ``"text"``/``"page_content"`` and
        ``"metadata"`` keys.

        Returns:
            A dict with the keys ``title``, ``text``, ``source`` and
            ``filename``.
        """
        if hasattr(doc, "page_content") and hasattr(doc, "metadata"):
            page_content = doc.page_content
            metadata = doc.metadata
        else:
            page_content = doc.get("text", doc.get("page_content", ""))
            metadata = doc.get("metadata", {})
        # Treat missing/None metadata as empty instead of failing on ``.get``.
        metadata = metadata or {}

        title = metadata.get("title", metadata.get("post_title", page_content))
        if not isinstance(title, str):
            title = str(title)
        # Keep only the last path segment when the title is path-like.
        title = title.split("/")[-1]

        filename = (
            metadata.get("filename")
            or metadata.get("file_name")
            or metadata.get("source")
        )
        if isinstance(filename, str):
            filename = os.path.basename(filename) or filename
        else:
            filename = title
        if not filename:
            filename = title

        return {
            "title": title,
            "text": page_content,
            "source": metadata.get("source") or vectorstore_id,
            "filename": filename,
        }

    def _get_data(self):
        """Query every configured vector store and collect matching documents.

        Documents are accepted in the order the stores return them, as long as
        they still fit into the token budget (90% of ``doc_token_limit``, at
        least 100 tokens). A document that does not fit is skipped, and later,
        smaller documents may still be accepted. A store whose lookup raises
        is logged and skipped.

        Returns:
            A list of dicts with the keys ``title``, ``text``, ``source`` and
            ``filename``; empty when ``chunks`` is 0 or no store is configured.
        """
        if self.chunks == 0 or not self.vectorstores:
            logging.info(
                f"ClassicRAG._get_data: Skipping retrieval - chunks={self.chunks}, "
                f"vectorstores_count={len(self.vectorstores) if self.vectorstores else 0}"
            )
            return []

        all_docs = []
        chunks_per_source = max(1, self.chunks // len(self.vectorstores))
        # Over-fetch so that skipping oversized documents still leaves
        # candidates to fill the budget.
        search_k = max(chunks_per_source * 2, 20)
        token_budget = max(int(self.doc_token_limit * 0.9), 100)
        cumulative_tokens = 0

        for vectorstore_id in self.vectorstores:
            if not vectorstore_id:
                continue
            if cumulative_tokens >= token_budget:
                # Budget exhausted: no point in querying further stores.
                break
            try:
                docsearch = VectorCreator.create_vectorstore(
                    settings.VECTOR_STORE, vectorstore_id, settings.EMBEDDINGS_KEY
                )
                docs_temp = docsearch.search(self.question, k=search_k)

                for doc in docs_temp:
                    if cumulative_tokens >= token_budget:
                        break

                    entry = self._build_doc_entry(doc, vectorstore_id)
                    filename = entry["filename"]
                    page_content = entry["text"]
                    doc_text_with_header = f"{filename}\n{page_content}"
                    doc_tokens = num_tokens_from_string(doc_text_with_header)

                    # ``<=`` lets a document that exactly fills the budget in,
                    # which is also what makes the early exits reachable.
                    if cumulative_tokens + doc_tokens <= token_budget:
                        all_docs.append(entry)
                        cumulative_tokens += doc_tokens

            except Exception as e:
                # Best effort: one failing store must not abort the others.
                logging.error(
                    f"Error searching vectorstore {vectorstore_id}: {e}",
                    exc_info=True,
                )
                continue

        logging.info(
            f"ClassicRAG._get_data: Retrieval complete - retrieved {len(all_docs)} documents "
            f"(requested chunks={self.chunks}, chunks_per_source={chunks_per_source}, "
            f"cumulative_tokens={cumulative_tokens}/{token_budget})"
        )
        return all_docs

    def search(self, query: str = ""):
        """Search for documents using optional query override

        Args:
            query: When non-empty, replaces the stored question (and triggers
                a new rephrase) before retrieval.

        Returns:
            The list of document dicts produced by ``_get_data``.
        """
        if query:
            self.original_question = query
            self.question = self._rephrase_query()
        return self._get_data()