From a2ef45e13f57976f84887f7825afbd49dc0b6440 Mon Sep 17 00:00:00 2001
From: jayantp2003
Date: Fri, 11 Oct 2024 17:08:04 +0530
Subject: [PATCH] Fix #859: Resolve issue with large zip breaking stream endpoint

---
NOTE(review): the line breaks and indentation of this patch were
reconstructed from a whitespace-collapsed copy. Context-line indentation is a
best guess; if hunks are rejected, retry with `git apply --ignore-whitespace`.
chunk_by_token_count was revised so that text cut off at a word boundary is
carried into the next chunk instead of being dropped.

 application/parser/file/rst_parser.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py
index 633ec844..eb9043b2 100644
--- a/application/parser/file/rst_parser.py
+++ b/application/parser/file/rst_parser.py
@@ -91,6 +91,50 @@ class RstParser(BaseParser):
         ]
         return rst_tups
 
+    def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
+        """Split text into chunks of roughly ``max_tokens`` tokens.
+
+        Token count is approximated as five characters per token, so each
+        chunk holds at most ``max_tokens * 5`` characters. A chunk is cut at
+        the last space inside its window when there is one; the text after
+        the cut starts the next chunk, so no characters other than
+        surrounding whitespace are discarded.
+
+        Args:
+            text: The text to split.
+            max_tokens: Approximate upper bound on tokens per chunk.
+
+        Returns:
+            The non-empty, whitespace-stripped chunks in document order.
+
+        Raises:
+            ValueError: If ``max_tokens`` is not positive.
+        """
+        if max_tokens <= 0:
+            raise ValueError("max_tokens must be a positive integer")
+
+        avg_token_length = 5
+        chunk_size = max_tokens * avg_token_length
+
+        chunks = []
+        text_len = len(text)
+        start = 0
+        while start < text_len:
+            end = min(start + chunk_size, text_len)
+            if end < text_len:
+                # Prefer a word boundary. Requiring it to lie past ``start``
+                # guarantees progress when the window has no usable space.
+                last_space = text.rfind(" ", start, end)
+                if last_space > start:
+                    end = last_space
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Resume exactly at the cut so the tail of the window is kept.
+            start = end
+
+        return chunks
+
     def remove_images(self, content: str) -> str:
         pattern = r"\.\. image:: (.*)"
         content = re.sub(pattern, "", content)
@@ -136,7 +180,7 @@ class RstParser(BaseParser):
         return {}
 
     def parse_tups(
-        self, filepath: Path, errors: str = "ignore"
+        self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:
@@ -156,6 +200,15 @@ class RstParser(BaseParser):
             rst_tups = self.remove_whitespaces_excess(rst_tups)
         if self._remove_characters_excess:
             rst_tups = self.remove_characters_excess(rst_tups)
+
+        # Apply chunking if max_tokens is provided
+        if max_tokens is not None:
+            chunked_tups = []
+            for header, text in rst_tups:
+                chunks = self.chunk_by_token_count(text, max_tokens)
+                for idx, chunk in enumerate(chunks):
+                    chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk))
+            return chunked_tups
         return rst_tups
 
     def parse_file(