Apply proper PEP 8 formatting

This commit is contained in:
Anton Larin
2023-05-12 12:02:25 +02:00
parent 7f56f57778
commit 168648e789
14 changed files with 139 additions and 123 deletions

View File

@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None
@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)
@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
return docs
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:
@@ -54,6 +57,7 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
return documents