Fix min_tokens logic for grouping documents: documents with (length >= min_tokens) should not be grouped into one document for indexing

Anton Larin
2023-08-05 13:17:37 +02:00
parent 1687e6682a
commit bed25b317c
2 changed files with 2 additions and 2 deletions

@@ -24,7 +24,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
             current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                      extra_info=doc.extra_info)
         elif len(tiktoken.get_encoding("cl100k_base").encode(
-                current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
+                current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
             current_group.text += " " + doc.text
         else:
             docs.append(current_group)
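
For context, here is a minimal sketch of how the corrected group_documents might read in full. Everything outside the hunk's visible context lines is an assumption: the loop scaffolding, the computation of doc_len, the Document stand-in, and the final flush of the last group are reconstructed for illustration, not taken from the repository.

# Sketch only: the Document dataclass and the loop scaffolding below are
# assumptions reconstructed around the diff hunk, not the repository's code.
from dataclasses import dataclass
from typing import List, Optional

import tiktoken


@dataclass
class Document:  # hypothetical stand-in for the project's Document type
    text: str
    doc_id: Optional[str] = None
    embedding: Optional[List[float]] = None
    extra_info: Optional[dict] = None


def group_documents(documents: List[Document], min_tokens: int,
                    max_tokens: int) -> List[Document]:
    # Merge short documents so that each indexed group reaches roughly
    # min_tokens tokens without exceeding max_tokens.
    docs: List[Document] = []
    current_group: Optional[Document] = None
    for doc in documents:
        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        if current_group is None:
            current_group = Document(text=doc.text, doc_id=doc.doc_id,
                                     embedding=doc.embedding,
                                     extra_info=doc.extra_info)
        elif len(tiktoken.get_encoding("cl100k_base").encode(
                current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
            # The fix: only merge documents *shorter* than min_tokens.
            # Before this commit the comparison was doc_len >= min_tokens,
            # which merged exactly the documents already long enough to
            # stand on their own and kept the short ones separate.
            current_group.text += " " + doc.text
        else:
            docs.append(current_group)
            current_group = Document(text=doc.text, doc_id=doc.doc_id,
                                     embedding=doc.embedding,
                                     extra_info=doc.extra_info)
    if current_group is not None:
        docs.append(current_group)  # flush the last open group
    return docs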