diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py
index d906e9b6..59991581 100644
--- a/application/parser/file/markdown_parser.py
+++ b/application/parser/file/markdown_parser.py
@@ -7,8 +7,8 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
-import tiktoken
 from application.parser.file.base_parser import BaseParser
+from application.utils import num_tokens_from_string
 
 
 class MarkdownParser(BaseParser):
@@ -38,7 +38,7 @@ class MarkdownParser(BaseParser):
 
     def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
         """Append to tups chunk."""
-        num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
+        num_tokens = num_tokens_from_string(current_text)
         if num_tokens > self._max_tokens:
             chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
             for chunk in chunks:
diff --git a/tests/parser/file/test_markdown_parser.py b/tests/parser/file/test_markdown_parser.py
index 826671f3..c3cb61ea 100644
--- a/tests/parser/file/test_markdown_parser.py
+++ b/tests/parser/file/test_markdown_parser.py
@@ -2,9 +2,9 @@ from pathlib import Path
 from unittest.mock import mock_open, patch
 
 import pytest
-import tiktoken
 
 from application.parser.file.markdown_parser import MarkdownParser
+from application import utils
 
 
 class _Enc:
@@ -14,7 +14,7 @@
 @pytest.fixture(autouse=True)
 def _patch_tokenizer(monkeypatch):
-    monkeypatch.setattr(tiktoken, "get_encoding", lambda _: _Enc())
+    monkeypatch.setattr(utils, "get_encoding", lambda: _Enc())
 
 
 def test_markdown_init_parser():
     parser = MarkdownParser()
@@ -57,4 +57,3 @@ def test_markdown_token_chunking_via_max_tokens():
     assert len(tups) > 1
     for _hdr, chunk in tups:
         assert len(chunk) <= 4
-