diff --git a/scripts/ingest.py b/scripts/ingest.py
index 9277be7a..75f98a2e 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -44,7 +44,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
                                               help="Maximum number of files to read."),
            formats: Optional[List[str]] = typer.Option([".rst", ".md"],
                                                        help="""List of required extensions (list with .)
-                Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html"""),
+                Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
            exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
 
     """
diff --git a/scripts/parser/file/bulk.py b/scripts/parser/file/bulk.py
index fb0c0ea2..cebc0b50 100644
--- a/scripts/parser/file/bulk.py
+++ b/scripts/parser/file/bulk.py
@@ -21,6 +21,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".md": MarkdownParser(),
     ".rst": RstParser(),
     ".html": HTMLParser(),
+    ".mdx": MarkdownParser(),
 }
 
 
diff --git a/scripts/parser/file/markdown_parser.py b/scripts/parser/file/markdown_parser.py
index 5c94ace2..2dd9e430 100644
--- a/scripts/parser/file/markdown_parser.py
+++ b/scripts/parser/file/markdown_parser.py
@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from parser.file.base_parser import BaseParser
+import tiktoken
 
 
 class MarkdownParser(BaseParser):
@@ -23,6 +24,7 @@ class MarkdownParser(BaseParser):
         *args: Any,
         remove_hyperlinks: bool = True,
         remove_images: bool = True,
+        max_tokens: int = 2048,
         # remove_tables: bool = True,
         **kwargs: Any,
     ) -> None:
@@ -30,8 +32,32 @@ class MarkdownParser(BaseParser):
         super().__init__(*args, **kwargs)
         self._remove_hyperlinks = remove_hyperlinks
         self._remove_images = remove_images
+        self._max_tokens = max_tokens
         # self._remove_tables = remove_tables
 
+    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
+        """Append a section to ``tups``, splitting it when it exceeds the token budget.
+
+        Text of at most ``self._max_tokens`` tokens (cl100k_base encoding) is
+        appended unchanged. Longer text is cut on token boundaries into pieces
+        of at most ``self._max_tokens`` tokens; every piece keeps
+        ``current_header``. ``tups`` is mutated in place and also returned.
+        """
+        encoding = tiktoken.get_encoding("cl100k_base")
+        # Documentation may legitimately mention special tokens such as
+        # "<|endoftext|>"; encode them as plain text instead of raising.
+        tokens = encoding.encode(current_text, disallowed_special=())
+        if len(tokens) <= self._max_tokens:
+            tups.append((current_header, current_text))
+            return tups
+        # Slice the token list, not the characters, so max_tokens really is a
+        # token limit; a multi-byte character split across two pieces decodes
+        # to U+FFFD replacement characters.
+        for start in range(0, len(tokens), self._max_tokens):
+            piece = tokens[start:start + self._max_tokens]
+            tups.append((current_header, encoding.decode(piece)))
+        return tups
+
     def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
         """Convert a markdown file to a dictionary.
 
@@ -50,13 +76,13 @@ class MarkdownParser(BaseParser):
                 if current_header is not None:
                     if current_text == "" or None:
                         continue
-                    markdown_tups.append((current_header, current_text))
+                    markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
 
                 current_header = line
                 current_text = ""
             else:
                 current_text += line + "\n"
-        markdown_tups.append((current_header, current_text))
+        markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
 
         if current_header is not None:
             # pass linting, assert keys are defined