mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
feat: better token counter
This commit is contained in:
@@ -1,6 +1,22 @@
|
||||
from transformers import GPT2TokenizerFast
|
||||
import tiktoken
|
||||
|
||||
# Module-level GPT-2 tokenizer obtained from the pretrained 'gpt2' vocabulary;
# count_tokens() calls this object to tokenize its input.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
|
||||
# Override the tokenizer's maximum length setting with 100000.
# NOTE(review): presumably this lets long inputs be counted without tripping
# the tokenizer's default length limit -- confirm against the transformers docs.
tokenizer.model_max_length = 100000
|
||||
def count_tokens(string):
    """Return how many token ids the module-level ``tokenizer`` yields for ``string``.

    The tokenizer is called on ``string`` and the length of the
    ``'input_ids'`` entry of its result is returned.
    """
    encoded = tokenizer(string)
    token_ids = encoded['input_ids']
    return len(token_ids)
|
||||
# Cache slot for the tiktoken encoding: stays None until get_encoding()
# fills it in on its first call.
_encoding = None
|
||||
|
||||
def get_encoding():
    """Return the ``cl100k_base`` tiktoken encoding, loading it on first use.

    The encoding object is kept in the module-level ``_encoding`` variable,
    so ``tiktoken.get_encoding`` is only called while that variable is still
    ``None``; later calls hand back the stored object.
    """
    global _encoding
    # Fast path: the encoding has already been loaded by an earlier call.
    if _encoding is not None:
        return _encoding
    _encoding = tiktoken.get_encoding("cl100k_base")
    return _encoding
|
||||
|
||||
def num_tokens_from_string(string: str) -> int:
    """Return the number of tokens ``string`` encodes to.

    The text is encoded with the encoding returned by ``get_encoding()`` and
    the length of the resulting token sequence is returned.
    """
    token_ids = get_encoding().encode(string)
    return len(token_ids)
|
||||
|
||||
def count_tokens_docs(docs):
    """Return the token count of all documents' text joined together.

    Args:
        docs: Iterable of objects that expose their text through a
            ``page_content`` string attribute.

    Returns:
        The value ``num_tokens_from_string`` reports for every
        ``page_content`` concatenated in iteration order with no separator.
        An empty ``docs`` results in counting the empty string.
    """
    # Build the combined text with a single join rather than repeated ``+=``,
    # which can degrade to quadratic copying as the number of docs grows.
    # The text is still tokenized as one string, exactly as before.
    combined_text = "".join(doc.page_content for doc in docs)
    return num_tokens_from_string(combined_text)
|
||||
Reference in New Issue
Block a user