Mirror of https://github.com/arc53/DocsGPT.git
synced 2026-01-20 05:50:58 +00:00
fix: tiktoken import in markdown parser
This commit is contained in:
@@ -7,8 +7,8 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
-import tiktoken
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
+from application.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
|
||||
@@ -38,7 +38,7 @@ class MarkdownParser(BaseParser):
|
||||
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
|
||||
current_text: str):
|
||||
"""Append to tups chunk."""
|
||||
-num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
|
||||
+num_tokens = num_tokens_from_string(current_text)
|
||||
if num_tokens > self._max_tokens:
|
||||
chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
|
||||
for chunk in chunks:
|
||||
|
||||
Reference in New Issue
Block a user