fix: replace direct tiktoken usage in markdown parser with the shared num_tokens_from_string helper

This commit is contained in:
Alex
2026-01-12 23:04:20 +00:00
parent f1d714b5c1
commit 2c55c6cd9a
2 changed files with 4 additions and 5 deletions

View File

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import tiktoken
from application.parser.file.base_parser import BaseParser
from application.utils import num_tokens_from_string
class MarkdownParser(BaseParser):
@@ -38,7 +38,7 @@ class MarkdownParser(BaseParser):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
num_tokens = num_tokens_from_string(current_text)
if num_tokens > self._max_tokens:
chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
for chunk in chunks:

View File

@@ -2,9 +2,9 @@ from pathlib import Path
from unittest.mock import mock_open, patch
import pytest
import tiktoken
from application.parser.file.markdown_parser import MarkdownParser
from application import utils
class _Enc:
@@ -14,7 +14,7 @@ class _Enc:
@pytest.fixture(autouse=True)
def _patch_tokenizer(monkeypatch):
monkeypatch.setattr(tiktoken, "get_encoding", lambda _: _Enc())
monkeypatch.setattr(utils, "get_encoding", lambda: _Enc())
def test_markdown_init_parser():
parser = MarkdownParser()
@@ -57,4 +57,3 @@ def test_markdown_token_chunking_via_max_tokens():
assert len(tups) > 1
for _hdr, chunk in tups:
assert len(chunk) <= 4