This commit is contained in:
Pavel
2023-03-13 19:14:33 +04:00
parent a44cde33ed
commit bac25112b7
2 changed files with 77 additions and 15 deletions

View File

@@ -1,13 +1,17 @@
import os
import re
import sys
import nltk
import dotenv
import typer
import ast
import tiktoken
from math import ceil
from collections import defaultdict
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Tuple
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -28,6 +32,57 @@ nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
def group_documents(documents: List[Document], min_tokens: int = 50, max_tokens: int = 2000) -> List[Document]:
    """Merge consecutive documents into larger grouped documents bounded by a token budget.

    A running group is extended with the next document's text (joined by a
    single space) while the combined token count stays below ``max_tokens``
    and the incoming document is itself at least ``min_tokens`` long;
    otherwise the running group is flushed and a new one is started from the
    incoming document.

    Args:
        documents: Documents to group, in order.
        min_tokens: Minimum token length a document must have to be merged
            into the current group (smaller documents start a new group).
        max_tokens: Upper bound on the token length of a merged group.

    Returns:
        A new list of grouped ``Document`` objects; metadata (doc_id,
        embedding, extra_info) of each group comes from its first document.
    """
    # Hoist the encoding lookup out of the loop: get_encoding() is
    # loop-invariant and the original called it twice per iteration.
    encoding = tiktoken.get_encoding("cl100k_base")
    groups: List[Document] = []
    current_group: Optional[Document] = None
    for doc in documents:
        doc_len = len(encoding.encode(doc.text))
        if current_group is None:
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
        elif len(encoding.encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
            current_group.text += " " + doc.text
        else:
            groups.append(current_group)
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
    # Flush the trailing group (None only when `documents` was empty).
    if current_group is not None:
        groups.append(current_group)
    return groups
def separate_header_and_body(text: str) -> Tuple[str, str]:
    """Split *text* into a header (its first three lines, newlines included)
    and the remaining body.

    The invariant ``header + body == text`` always holds.

    Args:
        text: The text to split.

    Returns:
        A ``(header, body)`` tuple. If the text contains fewer than three
        newline-terminated lines, the whole text is returned as the header
        with an empty body. (The original crashed with ``AttributeError``
        on ``match.group(0)`` in that case.)
    """
    header_pattern = r"^(.*?\n){3}"
    match = re.match(header_pattern, text)
    if match is None:
        # Fewer than three complete lines: nothing to separate.
        return text, ""
    header = match.group(0)
    body = text[len(header):]
    return header, body
def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
    """Split any document longer than ``max_tokens`` into smaller parts.

    Documents within the budget pass through unchanged. Oversized documents
    are split by character count into roughly equal body parts, each part
    re-prefixed with the document's header so every chunk keeps its context.

    Args:
        documents: Documents to check and, if needed, split.
        max_tokens: Maximum token length a document may have before being split.

    Returns:
        A new list in which every part of a split document carries a
        ``"{doc_id}-{i}"`` doc_id and inherits the original's embedding and
        extra_info.
    """
    # Loop-invariant: fetch the tokenizer once instead of per document.
    encoding = tiktoken.get_encoding("cl100k_base")
    new_documents = []
    for doc in documents:
        token_length = len(encoding.encode(doc.text))
        # (Removed leftover debug print of token_length.)
        if token_length <= max_tokens:
            new_documents.append(doc)
        else:
            header, body = separate_header_and_body(doc.text)
            # Number of parts is estimated from tokens, but the actual split
            # is by characters — parts are approximate, not exact token sizes.
            num_body_parts = ceil(token_length / max_tokens)
            part_length = ceil(len(body) / num_body_parts)
            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
            for i, body_part in enumerate(body_parts):
                new_doc = Document(text=header + body_part.strip(),
                                   doc_id=f"{doc.doc_id}-{i}",
                                   embedding=doc.embedding,
                                   extra_info=doc.extra_info)
                new_documents.append(new_doc)
    return new_documents
# Splits all files in the specified folder into documents
@app.command()
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@@ -56,6 +111,11 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
raw_docs = group_documents(raw_docs)
raw_docs = split_documents(raw_docs)
print(raw_docs)
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
@@ -109,3 +169,5 @@ def convert(dir: Optional[str] = typer.Option("inputs",
transform_to_docs(functions_dict, classes_dict, formats, dir)
# Script entry point: run the Typer CLI application when executed directly.
if __name__ == "__main__":
    app()