diff --git a/scripts/ingest.py b/scripts/ingest.py
index 66af137c..19c6cf5f 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,20 +1,16 @@
 import os
-import re
 import sys
 import nltk
 import dotenv
 import typer
-import ast
-import tiktoken
-from math import ceil
-
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+
 from parser.file.bulk import SimpleDirectoryReader
 from parser.schema.base import Document
 from parser.open_ai_func import call_openai_api, get_user_permission
@@ -22,6 +18,7 @@ from parser.py2doc import transform_to_docs
 from parser.py2doc import extract_functions_and_classes as extract_py
 from parser.js2doc import extract_functions_and_classes as extract_js
 from parser.java2doc import extract_functions_and_classes as extract_java
+from parser.token_func import group_split
 
 dotenv.load_dotenv()
 
@@ -32,57 +29,6 @@
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 
-def group_documents(documents: List[Document], min_tokens: int = 50, max_tokens: int = 2000) -> List[Document]:
-    groups = []
-    current_group = None
-
-    for doc in documents:
-        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
-
-        if current_group is None:
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
-            current_group.text += " " + doc.text
-        else:
-            groups.append(current_group)
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-
-    if current_group is not None:
-        groups.append(current_group)
-
-    return groups
-
-
-def separate_header_and_body(text):
-    header_pattern = r"^(.*?\n){3}"
-    match = re.match(header_pattern, text)
-    header = match.group(0)
-    body = text[len(header):]
-    return header, body
-
-def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
-    new_documents = []
-    for doc in documents:
-        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
-        print(token_length)
-        if token_length <= max_tokens:
-            new_documents.append(doc)
-        else:
-            header, body = separate_header_and_body(doc.text)
-            num_body_parts = ceil(token_length / max_tokens)
-            part_length = ceil(len(body) / num_body_parts)
-            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
-            for i, body_part in enumerate(body_parts):
-                new_doc = Document(text=header + body_part.strip(),
-                                   doc_id=f"{doc.doc_id}-{i}",
-                                   embedding=doc.embedding,
-                                   extra_info=doc.extra_info)
-                new_documents.append(new_doc)
-    return new_documents
-
-
 #Splits all files in specified folder to documents
 @app.command()
 def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@@ -111,16 +57,15 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
     raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
                                      required_exts=formats, num_files_limit=limit, exclude_hidden=exclude).load_data()
+    # Group and split documents based on min_tokens and max_tokens
+    raw_docs = group_split(documents=raw_docs)
 
-    raw_docs = group_documents(raw_docs)
-    raw_docs = split_documents(raw_docs)
-
-    print(raw_docs)
-    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     # Here we split the documents, as needed, into smaller chunks.
     # We do this due to the context limits of the LLMs.
-    text_splitter = RecursiveCharacterTextSplitter()
-    docs = text_splitter.split_documents(raw_docs)
+
+    # text_splitter = RecursiveCharacterTextSplitter()
+    # docs = text_splitter.split_documents(raw_docs)
 
     # Here we check for command line arguments for bot calls.
     # If no argument exists or the yes is not True, then the
diff --git a/scripts/outputs/v1/index.pkl b/scripts/outputs/v1/index.pkl
new file mode 100644
index 00000000..d98caa79
Binary files /dev/null and b/scripts/outputs/v1/index.pkl differ
diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py
index a130572d..1719b84c 100644
--- a/scripts/parser/file/rst_parser.py
+++ b/scripts/parser/file/rst_parser.py
@@ -29,7 +29,6 @@ class RstParser(BaseParser):
         remove_whitespaces_excess: bool = True,
         #Be carefull with remove_characters_excess, might cause data loss
         remove_characters_excess: bool = True,
-        # max_tokens: int = 2048,
         **kwargs: Any,
     ) -> None:
         """Init params."""
@@ -41,18 +40,6 @@ class RstParser(BaseParser):
         self._remove_directives = remove_directives
         self._remove_whitespaces_excess = remove_whitespaces_excess
         self._remove_characters_excess = remove_characters_excess
-        # self._max_tokens = max_tokens
-
-    # def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
-    #     """Append to tups chunk."""
-    #     num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
-    #     if num_tokens > self._max_tokens:
-    #         chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
-    #         for chunk in chunks:
-    #             tups.append((current_header, chunk))
-    #     else:
-    #         tups.append((current_header, current_text))
-    #     return tups
 
     def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
diff --git a/scripts/parser/token_func.py b/scripts/parser/token_func.py
new file mode 100644
index 00000000..3a44d4e1
--- /dev/null
+++ b/scripts/parser/token_func.py
@@ -0,0 +1,70 @@
+import re
+import tiktoken
+
+from typing import List
+from parser.schema.base import Document
+from math import ceil
+
+
+def separate_header_and_body(text):
+    header_pattern = r"^(.*?\n){3}"
+    match = re.match(header_pattern, text)
+    header = match.group(0)
+    body = text[len(header):]
+    return header, body
+
+def group_documents(documents: List[Document], min_tokens: int = 200, max_tokens: int = 2000) -> List[Document]:
+    docs = []
+    current_group = None
+
+    for doc in documents:
+        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
+
+        if current_group is None:
+            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
+                                     extra_info=doc.extra_info)
+        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
+            current_group.text += " " + doc.text
+        else:
+            docs.append(current_group)
+            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
+                                     extra_info=doc.extra_info)
+
+    if current_group is not None:
+        docs.append(current_group)
+
+    return docs
+
+def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
+    docs = []
+    for doc in documents:
+        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
+        if token_length <= max_tokens:
+            docs.append(doc)
+        else:
+            header, body = separate_header_and_body(doc.text)
+            num_body_parts = ceil(token_length / max_tokens)
+            part_length = ceil(len(body) / num_body_parts)
+            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
+            for i, body_part in enumerate(body_parts):
+                new_doc = Document(text=header + body_part.strip(),
+                                   doc_id=f"{doc.doc_id}-{i}",
+                                   embedding=doc.embedding,
+                                   extra_info=doc.extra_info)
+                docs.append(new_doc)
+    return docs
+
+def group_split(documents: List[Document], max_tokens: int = 1500, min_tokens: int = 500, token_check: bool = True):
+    if not token_check:
+        return documents
+    print("Grouping small documents")
+    try:
+        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
+    except Exception:
+        print("Grouping failed, try running without token_check")
+    print("Separating large documents")
+    try:
+        documents = split_documents(documents=documents, max_tokens=max_tokens)
+    except Exception:
+        print("Separating failed, try running without token_check")
+    return documents
diff --git a/scripts/test_ingestion.py b/scripts/test_ingestion.py
index e4aad822..6e3723e3 100644
--- a/scripts/test_ingestion.py
+++ b/scripts/test_ingestion.py
@@ -7,7 +7,7 @@ from langchain.embeddings import OpenAIEmbeddings
 dotenv.load_dotenv()
 embeddings_key = os.getenv("API_KEY")
 
-docsearch = FAISS.load_local('outputs/inputs', OpenAIEmbeddings(openai_api_key=embeddings_key))
+docsearch = FAISS.load_local('outputs', OpenAIEmbeddings(openai_api_key=embeddings_key))
 
 d1 = docsearch.similarity_search("Whats new in 1.5.3?")
 print(d1)
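
Note on the new chunking path: group_split first merges consecutive documents while the combined text stays under max_tokens (group_documents), then breaks anything still larger than max_tokens on a three-line header/body boundary (split_documents). Below is a minimal sketch, not part of the patch, of how it can be exercised in isolation. The Document keyword arguments mirror the constructor calls in token_func.py; the sample texts and the embedding=None / extra_info={} placeholders are illustrative assumptions.

    # Minimal sketch: run from scripts/ so the parser package is importable;
    # requires tiktoken. Placeholder values for embedding/extra_info are assumed.
    from parser.schema.base import Document
    from parser.token_func import group_split

    sample_docs = [
        # Two mid-sized sections that fit together under max_tokens,
        # so group_documents merges them into one Document.
        Document(text="Intro\nSection 1\n\n" + "word " * 600,
                 doc_id="a", embedding=None, extra_info={}),
        Document(text="Section 2\nDetails\n\n" + "word " * 600,
                 doc_id="b", embedding=None, extra_info={}),
        # One oversized section that split_documents breaks into roughly
        # max_tokens-sized parts, each repeating the three-line header.
        Document(text="Big\nSection\n\n" + "word " * 4000,
                 doc_id="c", embedding=None, extra_info={}),
    ]

    chunks = group_split(documents=sample_docs, min_tokens=500, max_tokens=1500)
    for chunk in chunks:
        print(chunk.doc_id, len(chunk.text))

This step replaces the RecursiveCharacterTextSplitter pass that ingest previously ran (now commented out above), and it can be bypassed entirely by calling group_split with token_check=False.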