feat: better token counter

2026-02-11 16:51:04 +00:00 · 2024-08-31 17:07:40 +01:00
parent c49b7613e0
commit d9309ebc6e
6 changed files with 34 additions and 40 deletions
--- a/application/utils.py
+++ b/application/utils.py
@@ -1,6 +1,22 @@
-from transformers import GPT2TokenizerFast
+import tiktoken

-tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
-tokenizer.model_max_length = 100000
-def count_tokens(string):
-    return len(tokenizer(string)['input_ids'])
+_encoding = None  # cached tiktoken encoding; filled in on the first get_encoding() call
+
+def get_encoding():
+    """Return the shared ``cl100k_base`` encoding, creating it on first call."""
+    global _encoding
+    _encoding = tiktoken.get_encoding("cl100k_base") if _encoding is None else _encoding
+    return _encoding
+
+def num_tokens_from_string(string: str) -> int:
+    """Return the number of tokens ``string`` encodes to with the shared encoding."""
+    # disallowed_special=(): count "<|endoftext|>"-style text instead of raising.
+    return len(get_encoding().encode(string, disallowed_special=()))
+
+def count_tokens_docs(docs):
+    """Return the token count of every document's text joined together.
+
+    Each item of ``docs`` must expose its text as a ``page_content`` string.
+    """
+    # One join instead of ``+=``, which can re-copy the growing string each pass.
+    return num_tokens_from_string("".join(doc.page_content for doc in docs))