mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
Fix min_tokens logic for grouping documents: documents with (lengh >= min_tokens) should not be grouped into one document for indexing
This commit is contained in:
@@ -25,7 +25,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
|
||||
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
|
||||
extra_info=doc.extra_info)
|
||||
elif len(tiktoken.get_encoding("cl100k_base").encode(
|
||||
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
|
||||
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
|
||||
current_group.text += " " + doc.text
|
||||
else:
|
||||
docs.append(current_group)
|
||||
|
||||
@@ -24,7 +24,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
|
||||
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
|
||||
extra_info=doc.extra_info)
|
||||
elif len(tiktoken.get_encoding("cl100k_base").encode(
|
||||
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
|
||||
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
|
||||
current_group.text += " " + doc.text
|
||||
else:
|
||||
docs.append(current_group)
|
||||
|
||||
Reference in New Issue
Block a user