From bbce872ac5e65b8a032689ffb745a547157d4f3b Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Fri, 4 Jul 2025 02:19:58 +0530
Subject: [PATCH] (fix:chunker) combine metadata as well

---
Review notes (text in this section is not part of the commit message):
* Revised so that a 'source' value that is already a list is flattened
  into the combined list instead of being nested inside it.
* Added a docstring to combine_documents.
* The post-image blob id on the "index" line below was not recomputed
  for this revision.

 application/parser/chunking.py | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/application/parser/chunking.py b/application/parser/chunking.py
index aae14898..f2a69dac 100644
--- a/application/parser/chunking.py
+++ b/application/parser/chunking.py
@@ -35,11 +35,54 @@ class Chunker:
     def combine_documents(self, doc: Document, next_doc: Document) -> Document:
+        """Merge two adjacent documents into a single new Document.
+
+        The texts are joined with a single space and ``token_count`` is
+        recomputed from the joined text with ``self.encoding``. ``doc_id``,
+        ``embedding`` (not recomputed for the merged text) and the remaining
+        ``extra_info`` keys are copied from ``doc``; those of ``next_doc``
+        are not carried over.
+
+        When present, the ``source`` values of both documents are collected
+        into one flat list and the ``title`` values are joined with ", ".
+
+        Args:
+            doc: The leading document.
+            next_doc: The document whose text is appended to ``doc``.
+
+        Returns:
+            A new Document holding the merged text and metadata.
+        """
         combined_text = doc.text + " " + next_doc.text
         combined_token_count = len(self.encoding.encode(combined_text))
+
+        combined_extra_info = {**(doc.extra_info or {}), "token_count": combined_token_count}
+
+        sources = []
+        titles = []
+        for info in (doc.extra_info, next_doc.extra_info):
+            if not info:
+                continue
+            if 'source' in info:
+                source = info['source']
+                # A document produced by an earlier combine already stores a
+                # list here; extend so the result stays flat instead of nesting.
+                if isinstance(source, list):
+                    sources.extend(source)
+                else:
+                    sources.append(source)
+            if 'title' in info:
+                titles.append(info['title'])
+
+        if sources:
+            combined_extra_info['source'] = sources
+
+        if titles:
+            combined_extra_info['title'] = ", ".join(titles)
+
         new_doc = Document(
             text=combined_text,
             doc_id=doc.doc_id,
             embedding=doc.embedding,
-            extra_info={**(doc.extra_info or {}), "token_count": combined_token_count}
+            extra_info=combined_extra_info
         )
         return new_doc
 