Merge pull request #295 from larinam/min_tokens_logic_fix

Fix min_tokens logic for grouping documents
This commit is contained in:
Alex
2023-08-05 14:36:34 +01:00
committed by GitHub
2 changed files with 2 additions and 2 deletions

View File

@@ -25,7 +25,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)

View File

@@ -24,7 +24,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)