mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 00:53:14 +00:00
Merge branch 'main' of https://github.com/ManishMadan2882/docsgpt
This commit is contained in:
@@ -123,7 +123,7 @@ docker compose -f docker-compose-dev.yaml up -d
|
||||
### Run the Backend
|
||||
|
||||
> [!Note]
|
||||
> Make sure you have Python 3.10 or 3.11 installed.
|
||||
> Make sure you have Python 3.12 installed.
|
||||
|
||||
1. Export required environment variables or prepare a `.env` file in the project folder:
|
||||
- Copy [.env-template](https://github.com/arc53/DocsGPT/blob/main/application/.env-template) and create `.env`.
|
||||
|
||||
@@ -91,6 +91,25 @@ class RstParser(BaseParser):
|
||||
]
|
||||
return rst_tups
|
||||
|
||||
def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
    """Split *text* into chunks of roughly *max_tokens* tokens each.

    Token counts are approximated, not measured: one token is assumed to
    be ``avg_token_length`` (5) characters, so each chunk holds at most
    ``max_tokens * 5`` characters. Cuts are made on a space whenever the
    current window contains one, so words are only split when a single
    word is longer than the whole window.

    No text is discarded: each chunk starts exactly where the previous one
    ended (only the whitespace at chunk boundaries is stripped), and
    whitespace-only chunks are omitted from the result.

    Args:
        text: The text to split.
        max_tokens: Approximate maximum number of tokens per chunk.
            Must be positive.

    Returns:
        The non-empty, stripped chunks in document order. An empty or
        whitespace-only *text* yields an empty list.

    Raises:
        ValueError: If *max_tokens* is not a positive integer.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    avg_token_length = 5  # rough characters-per-token heuristic
    chunk_size = max_tokens * avg_token_length

    chunks: List[str] = []
    text_len = len(text)
    start = 0
    while start < text_len:
        # Skip whitespace left at the cut point so a window never starts
        # on a space (which would otherwise produce an empty chunk).
        if text[start].isspace():
            start += 1
            continue

        end = min(start + chunk_size, text_len)
        # Only look for a word boundary when the window would otherwise
        # cut through the middle of a word.
        if end < text_len and not text[end].isspace():
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Resume from the actual cut point, not from start + chunk_size,
        # so the tail after the last space is carried into the next chunk.
        start = end

    return chunks
|
||||
|
||||
def remove_images(self, content: str) -> str:
|
||||
pattern = r"\.\. image:: (.*)"
|
||||
content = re.sub(pattern, "", content)
|
||||
@@ -136,7 +155,7 @@ class RstParser(BaseParser):
|
||||
return {}
|
||||
|
||||
def parse_tups(
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000
|
||||
) -> List[Tuple[Optional[str], str]]:
|
||||
"""Parse file into tuples."""
|
||||
with open(filepath, "r") as f:
|
||||
@@ -156,6 +175,15 @@ class RstParser(BaseParser):
|
||||
rst_tups = self.remove_whitespaces_excess(rst_tups)
|
||||
if self._remove_characters_excess:
|
||||
rst_tups = self.remove_characters_excess(rst_tups)
|
||||
|
||||
# Apply chunking if max_tokens is provided
|
||||
if max_tokens is not None:
|
||||
chunked_tups = []
|
||||
for header, text in rst_tups:
|
||||
chunks = self.chunk_by_token_count(text, max_tokens)
|
||||
for idx, chunk in enumerate(chunks):
|
||||
chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk))
|
||||
return chunked_tups
|
||||
return rst_tups
|
||||
|
||||
def parse_file(
|
||||
|
||||
Reference in New Issue
Block a user