fix: chunking

This commit is contained in:
Alex
2025-09-24 22:59:53 +01:00
parent f153435c08
commit 56256051d2
2 changed files with 27 additions and 5 deletions

View File

@@ -264,7 +264,15 @@ class BaseAgent(ABC):
  query: str,
  retrieved_data: List[Dict],
  ) -> List[Dict]:
- docs_together = "\n".join([doc["text"] for doc in retrieved_data])
+ docs_with_filenames = []
+ for doc in retrieved_data:
+ filename = doc.get("filename") or doc.get("title") or doc.get("source")
+ if filename:
+ chunk_header = str(filename)
+ docs_with_filenames.append(f"{chunk_header}\n{doc['text']}")
+ else:
+ docs_with_filenames.append(doc["text"])
+ docs_together = "\n\n".join(docs_with_filenames)
  p_chat_combine = system_prompt.replace("{summaries}", docs_together)
  messages_combine = [{"role": "system", "content": p_chat_combine}]

View File

@@ -1,4 +1,5 @@
  import logging
+ import os
  from application.core.settings import settings
  from application.llm.llm_creator import LLMCreator
@@ -141,15 +142,28 @@ class ClassicRAG(BaseRetriever):
  title = metadata.get(
  "title", metadata.get("post_title", page_content)
  )
- if isinstance(title, str):
- title = title.split("/")[-1]
+ if not isinstance(title, str):
+ title = str(title)
+ title = title.split("/")[-1]
+ filename = (
+ metadata.get("filename")
+ or metadata.get("file_name")
+ or metadata.get("source")
+ )
+ if isinstance(filename, str):
+ filename = os.path.basename(filename) or filename
  else:
- title = str(title).split("/")[-1]
+ filename = title
+ if not filename:
+ filename = title
+ source_path = metadata.get("source") or vectorstore_id
  all_docs.append(
  {
  "title": title,
  "text": page_content,
- "source": metadata.get("source") or vectorstore_id,
+ "source": source_path,
+ "filename": filename,
  }
  )
  except Exception as e: