feat: better token counter

This commit is contained in:
Alex
2024-08-31 17:07:40 +01:00
parent c49b7613e0
commit d9309ebc6e
6 changed files with 34 additions and 40 deletions

View File

@@ -1,6 +1,22 @@
from transformers import GPT2TokenizerFast
import tiktoken
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.model_max_length = 100000
def count_tokens(string):
return len(tokenizer(string)['input_ids'])
_encoding = None
def get_encoding():
global _encoding
if _encoding is None:
_encoding = tiktoken.get_encoding("cl100k_base")
return _encoding
def num_tokens_from_string(string: str) -> int:
encoding = get_encoding()
num_tokens = len(encoding.encode(string))
return num_tokens
def count_tokens_docs(docs):
docs_content = ""
for doc in docs:
docs_content += doc.page_content
tokens = num_tokens_from_string(docs_content)
return tokens