token ingest

This commit is contained in:
Pavel
2023-03-14 13:32:29 +04:00
parent 20a0800aa7
commit b6c02c850a
5 changed files with 80 additions and 78 deletions

View File

@@ -1,20 +1,16 @@
import os
import re
import sys
import nltk
import dotenv
import typer
import ast
import tiktoken
from math import ceil
from collections import defaultdict
from pathlib import Path
from typing import List, Optional, Tuple
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
@@ -22,6 +18,7 @@ from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.token_func import group_split
dotenv.load_dotenv()
@@ -32,57 +29,6 @@ nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
def group_documents(documents: List[Document], min_tokens: int = 50, max_tokens: int = 2000) -> List[Document]:
    """Merge consecutive documents into larger groups bounded by a token budget.

    Documents are appended (space-separated) onto the current group while the
    combined token count stays under ``max_tokens`` and the incoming document
    is at least ``min_tokens`` tokens long; otherwise the current group is
    closed and a new one is started from the incoming document.

    :param documents: documents to merge, in order.
    :param min_tokens: token threshold an incoming document must meet to be
        merged into the current group.
    :param max_tokens: upper bound on a group's combined token count.
    :return: list of grouped documents; empty if ``documents`` is empty.
    """
    # Hoisted out of the loop: the original fetched the encoding registry
    # entry twice per document.
    encoder = tiktoken.get_encoding("cl100k_base")
    groups: List[Document] = []
    current_group: Optional[Document] = None
    for doc in documents:
        doc_len = len(encoder.encode(doc.text))
        if current_group is None:
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
        # NOTE(review): `doc_len >= min_tokens` means documents SHORTER than
        # min_tokens always start a new group, which looks inverted (small
        # docs are usually the ones to merge) — TODO confirm intent.
        elif len(encoder.encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
            current_group.text += " " + doc.text
        else:
            groups.append(current_group)
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
    # Flush the trailing group (None only when `documents` was empty).
    if current_group is not None:
        groups.append(current_group)
    return groups
def separate_header_and_body(text: str):
    """Split ``text`` into a header (its first three lines) and the remaining body.

    :param text: full document text.
    :return: ``(header, body)`` tuple of strings. When the text contains fewer
        than three newline-terminated lines, returns ``("", text)``.
    """
    header_pattern = r"^(.*?\n){3}"
    match = re.match(header_pattern, text)
    # Bug fix: short texts produced match == None and the original crashed
    # with AttributeError on match.group(0). Treat them as body-only.
    if match is None:
        return "", text
    header = match.group(0)
    body = text[len(header):]
    return header, body
def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
    """Split oversized documents into token-bounded parts, repeating the header.

    Documents at or under ``max_tokens`` tokens pass through unchanged. Larger
    ones are separated into a header and body; the body is sliced into roughly
    equal character chunks (a character-count approximation of equal token
    chunks) and the header is prepended to every part.

    :param documents: documents to split.
    :param max_tokens: token budget that triggers splitting.
    :return: new list with pass-through and part documents; parts get ids of
        the form ``"{doc_id}-{i}"``.
    """
    # Hoisted: avoid a registry lookup per document.
    encoder = tiktoken.get_encoding("cl100k_base")
    new_documents: List[Document] = []
    for doc in documents:
        token_length = len(encoder.encode(doc.text))
        # Removed leftover debug print(token_length) that polluted stdout.
        if token_length <= max_tokens:
            new_documents.append(doc)
            continue
        header, body = separate_header_and_body(doc.text)
        num_body_parts = ceil(token_length / max_tokens)
        part_length = ceil(len(body) / num_body_parts)
        body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
        for i, body_part in enumerate(body_parts):
            new_doc = Document(text=header + body_part.strip(),
                               doc_id=f"{doc.doc_id}-{i}",
                               embedding=doc.embedding,
                               extra_info=doc.extra_info)
            new_documents.append(new_doc)
    return new_documents
#Splits all files in specified folder to documents
@app.command()
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@@ -111,16 +57,15 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
#Checking min_tokens and max_tokens
raw_docs = group_split(documents=raw_docs)
raw_docs = group_documents(raw_docs)
raw_docs = split_documents(raw_docs)
print(raw_docs)
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(raw_docs)
# text_splitter = RecursiveCharacterTextSplitter()
# docs = text_splitter.split_documents(raw_docs)
# Here we check for command line arguments for bot calls.
# If no argument exists or the yes is not True, then the