diff --git a/scripts/ingest.py b/scripts/ingest.py index 19c6cf5f..e8082c5d 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -5,12 +5,8 @@ import dotenv import typer from collections import defaultdict -from pathlib import Path from typing import List, Optional -from langchain.text_splitter import RecursiveCharacterTextSplitter - - from parser.file.bulk import SimpleDirectoryReader from parser.schema.base import Document from parser.open_ai_func import call_openai_api, get_user_permission @@ -39,14 +35,17 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, file: Optional[List[str]] = typer.Option(None, help="""File paths to use (Optional; overrides dir). E.g. --file inputs/1.md --file inputs/2.md"""), - recursive: Optional[bool] = typer.Option(True, - help="Whether to recursively search in subdirectories."), - limit: Optional[int] = typer.Option(None, - help="Maximum number of files to read."), + recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."), + limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."), formats: Optional[List[str]] = typer.Option([".rst", ".md"], help="""List of required extensions (list with .) Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""), - exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): + exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."), + sample: Optional[bool] = typer.Option(False, help="Whether to output sample of the first 5 split documents."), + token_check: Optional[bool] = typer.Option(True, help="Whether to group small documents and split large."), + min_tokens: Optional[int] = typer.Option(150, help="Minimum number of tokens to not group."), + max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split."), + ): """ Creates index from specified location or files. 
@@ -57,16 +56,22 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive, required_exts=formats, num_files_limit=limit, exclude_hidden=exclude).load_data() - #Checking min_tokens and max_tokens - raw_docs = group_split(documents=raw_docs) - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] # Here we split the documents, as needed, into smaller chunks. # We do this due to the context limits of the LLMs. - + raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + #Old method # text_splitter = RecursiveCharacterTextSplitter() # docs = text_splitter.split_documents(raw_docs) + #Sample feature + if sample: + for i in range(min(5, len(raw_docs))): + print(raw_docs[i].text) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + # Here we check for command line arguments for bot calls. # If no argument exists or the yes is not True, then the # user permission is requested to call the API. 
diff --git a/scripts/parser/token_func.py b/scripts/parser/token_func.py index 3a44d4e1..be619522 100644 --- a/scripts/parser/token_func.py +++ b/scripts/parser/token_func.py @@ -13,7 +13,7 @@ def separate_header_and_body(text): body = text[len(header):] return header, body -def group_documents(documents: List[Document], min_tokens: int = 200, max_tokens: int = 2000) -> List[Document]: +def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: docs = [] current_group = None @@ -35,7 +35,7 @@ def group_documents(documents: List[Document], min_tokens: int = 200, max_tokens return docs -def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]: +def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: docs = [] for doc in documents: token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) @@ -54,7 +54,7 @@ def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[D docs.append(new_doc) return docs -def group_split(documents: List[Document], max_tokens: int = 1500, min_tokens: int = 500, token_check: bool = True): +def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): if token_check == False: return documents print("Grouping small documents") diff --git a/scripts/test_ingestion.py b/scripts/test_ingestion.py deleted file mode 100644 index 6e3723e3..00000000 --- a/scripts/test_ingestion.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -import dotenv -import tiktoken -from langchain import FAISS -from langchain.embeddings import OpenAIEmbeddings - -dotenv.load_dotenv() -embeddings_key = os.getenv("API_KEY") -docsearch = FAISS.load_local('outputs', OpenAIEmbeddings(openai_api_key=embeddings_key)) - -d1 = docsearch.similarity_search("Whats new in 1.5.3?") -print(d1) -print("=====================================") 
-print("=====================================") -for i in d1: - print("docs length (tokens)") - doc_len = len(tiktoken.get_encoding("cl100k_base").encode(i.page_content)) - print(doc_len)