This commit is contained in:
ManishMadan2882
2024-11-22 17:31:06 +05:30
2 changed files with 30 additions and 2 deletions

View File

@@ -123,7 +123,7 @@ docker compose -f docker-compose-dev.yaml up -d
### Run the Backend
> [!Note]
> Make sure you have Python 3.10 or 3.11 installed.
> Make sure you have Python 3.12 installed.
1. Export required environment variables or prepare a `.env` file in the project folder:
- Copy [.env-template](https://github.com/arc53/DocsGPT/blob/main/application/.env-template) and create `.env`.

View File

@@ -91,6 +91,25 @@ class RstParser(BaseParser):
]
return rst_tups
def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
    """Split text into chunks of approximately ``max_tokens`` tokens each.

    Token count is estimated with a fixed average of 5 characters per
    token, so each chunk holds at most ``max_tokens * 5`` characters.
    Chunks are cut at the last space inside the window when one exists,
    so words are kept whole where possible; a run of text longer than the
    window with no space in it is split at the window edge.

    No non-whitespace text is dropped: the next chunk starts exactly where
    the previous one was cut. Chunks are stripped of surrounding
    whitespace, and chunks that would be empty are omitted.

    Args:
        text: The text to split.
        max_tokens: Approximate maximum number of tokens per chunk.
            Must be a positive integer.

    Returns:
        The list of non-empty, stripped chunks in original order. An
        empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    avg_token_length = 5  # rough characters-per-token estimate
    chunk_size = max_tokens * avg_token_length
    text_length = len(text)

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length:
            # Prefer to cut on a word boundary. A space sitting exactly at
            # ``start`` is ignored: cutting there would produce an empty
            # chunk and make no forward progress.
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Resume from the actual cut point so the remainder of the window
        # after the chosen space is carried into the next chunk.
        start = end
    return chunks
def remove_images(self, content: str) -> str:
pattern = r"\.\. image:: (.*)"
content = re.sub(pattern, "", content)
@@ -136,7 +155,7 @@ class RstParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -156,6 +175,15 @@ class RstParser(BaseParser):
rst_tups = self.remove_whitespaces_excess(rst_tups)
if self._remove_characters_excess:
rst_tups = self.remove_characters_excess(rst_tups)
# Apply chunking if max_tokens is provided
if max_tokens is not None:
chunked_tups = []
for header, text in rst_tups:
chunks = self.chunk_by_token_count(text, max_tokens)
for idx, chunk in enumerate(chunks):
chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk))
return chunked_tups
return rst_tups
def parse_file(