diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py
index 633ec844..d39a0837 100644
--- a/application/parser/file/rst_parser.py
+++ b/application/parser/file/rst_parser.py
@@ -91,6 +91,50 @@ class RstParser(BaseParser):
         ]
         return rst_tups
 
+    def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
+        """Split text into chunks of roughly ``max_tokens`` tokens each.
+
+        Tokens are approximated as 5 characters each, so a chunk holds at
+        most ``max_tokens * 5`` characters. A chunk ends at the last space
+        that fits so words stay whole; only a single word longer than the
+        limit is cut mid-word. Every non-whitespace character of ``text``
+        ends up in exactly one chunk.
+
+        Args:
+            text: Text to split.
+            max_tokens: Approximate token budget per chunk; must be positive.
+
+        Returns:
+            The stripped, non-empty chunks in their original order.
+
+        Raises:
+            ValueError: If ``max_tokens`` is not positive.
+        """
+        if max_tokens <= 0:
+            raise ValueError("max_tokens must be a positive integer")
+
+        avg_token_length = 5  # rough characters-per-token estimate
+        chunk_size = max_tokens * avg_token_length
+
+        chunks = []
+        start = 0
+        text_length = len(text)
+        while start < text_length:
+            if text[start].isspace():
+                # Skip separators so a chunk never starts with padding.
+                start += 1
+                continue
+            end = start + chunk_size
+            if end < text_length:
+                # Cut at the last space in (or right after) the window and
+                # resume from there, so the tail of the window is not lost.
+                last_space = text.rfind(" ", start, end + 1)
+                if last_space != -1:
+                    end = last_space
+            chunks.append(text[start:end].strip())
+            start = end
+        return chunks
+
     def remove_images(self, content: str) -> str:
         pattern = r"\.\. image:: (.*)"
         content = re.sub(pattern, "", content)
@@ -136,7 +180,7 @@ class RstParser(BaseParser):
         return {}
 
     def parse_tups(
-        self, filepath: Path, errors: str = "ignore"
+        self, filepath: Path, errors: str = "ignore", max_tokens: Optional[int] = 1000
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:
@@ -156,6 +200,21 @@ class RstParser(BaseParser):
             rst_tups = self.remove_whitespaces_excess(rst_tups)
         if self._remove_characters_excess:
             rst_tups = self.remove_characters_excess(rst_tups)
+
+        # Split sections that exceed the token budget; max_tokens=None disables this.
+        if max_tokens is not None:
+            chunked_tups = []
+            for header, text in rst_tups:
+                chunks = self.chunk_by_token_count(text, max_tokens)
+                if len(chunks) <= 1:
+                    # Fits in one chunk (or is empty): keep the section as-is.
+                    chunked_tups.append((header, text))
+                    continue
+                for idx, chunk in enumerate(chunks, start=1):
+                    # A None header stays None instead of becoming "None - Chunk 1".
+                    title = None if header is None else f"{header} - Chunk {idx}"
+                    chunked_tups.append((title, chunk))
+            return chunked_tups
         return rst_tups
 
     def parse_file(