feat: edit deployment file locations

2026-03-05 13:23:46 +00:00 · 2025-02-05 18:04:41 +00:00
parent d754a43fba
commit 0913c43219
48 changed files with 21 additions and 1927 deletions
--- a/scripts/init.py
+++ b/scripts/init.py
--- a/scripts/code_docs_gen.py
+++ b/scripts/code_docs_gen.py
@@ -1,95 +0,0 @@
-import ast
-import json
-from pathlib import Path
-
-import dotenv
-from langchain_community.llms import OpenAI
-from langchain.prompts import PromptTemplate
-
-dotenv.load_dotenv()
-
-ps = list(Path("inputs").glob("**/*.py"))
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-
-def get_functions_in_class(node):
-    functions = []
-    functions_code = []
-    for child in node.body:
-        if isinstance(child, ast.FunctionDef):
-            functions.append(child.name)
-            functions_code.append(ast.unparse(child))
-
-    return functions, functions_code
-
-
-def get_classes_and_functions(source_code):
-    tree = ast.parse(source_code)
-    classes = {}
-    for node in tree.body:
-        if isinstance(node, ast.ClassDef):
-            class_name = node.name
-            function_name, function = get_functions_in_class(node)
-            # join function name and function code
-            functions = dict(zip(function_name, function))
-            classes[class_name] = functions
-    return classes
-
-
-structure_dict = {}
-c1 = 0
-for code in data:
-    classes = get_classes_and_functions(ast.parse(code))
-    source = str(sources[c1])
-    structure_dict[source] = classes
-    c1 += 1
-
-# save the structure dict as json
-with open('structure_dict.json', 'w') as f:
-    json.dump(structure_dict, f)
-
-if not Path("outputs").exists():
-    Path("outputs").mkdir()
-
-c1 = len(structure_dict)
-c2 = 0
-for source, classes in structure_dict.items():
-    c2 += 1
-    print(f"Processing file {c2}/{c1}")
-    f1 = len(classes)
-    f2 = 0
-    for class_name, functions in classes.items():
-        f2 += 1
-        print(f"Processing class {f2}/{f1}")
-        source_w = source.replace("inputs/", "")
-        source_w = source_w.replace(".py", ".txt")
-        if not Path(f"outputs/{source_w}").exists():
-            with open(f"outputs/{source_w}", "w") as f:
-                f.write(f"Class: {class_name}")
-        else:
-            with open(f"outputs/{source_w}", "a") as f:
-                f.write(f"\n\nClass: {class_name}")
-        # append class name to the front
-        for function in functions:
-            b1 = len(functions)
-            b2 = 0
-            print(f"Processing function {b2}/{b1}")
-            b2 += 1
-            prompt = PromptTemplate(
-                input_variables=["code"],
-                template="Code: \n{code}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(code=functions[function]))
-
-            if not Path(f"outputs/{source_w}").exists():
-                with open(f"outputs/{source_w}", "w") as f:
-                    f.write(f"Function: {functions[function]}, \nDocumentation: {response}")
-            else:
-                with open(f"outputs/{source_w}", "a") as f:
-                    f.write(f"\n\nFunction: {functions[function]}, \nDocumentation: {response}")
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,128 +0,0 @@
-import os
-import sys
-from collections import defaultdict
-from typing import List, Optional
-
-import dotenv
-import nltk
-import typer
-
-from parser.file.bulk import SimpleDirectoryReader
-from parser.java2doc import extract_functions_and_classes as extract_java
-from parser.js2doc import extract_functions_and_classes as extract_js
-from parser.open_ai_func import call_openai_api, get_user_permission
-from parser.py2doc import extract_functions_and_classes as extract_py
-from parser.py2doc import transform_to_docs
-from parser.schema.base import Document
-from parser.token_func import group_split
-
-dotenv.load_dotenv()
-
-app = typer.Typer(add_completion=False)
-
-nltk.download('punkt', quiet=True)
-nltk.download('averaged_perceptron_tagger', quiet=True)
-
-
-def metadata_from_filename(title):
-    return {'title': title}
-
-# Splits all files in specified folder to documents
-@app.command()
-def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
-                                    help="Whether to skip price confirmation"),
-           dir: Optional[List[str]] = typer.Option(["inputs"],
-                                                   help="""List of paths to directory for index creation.
-                                                        E.g. --dir inputs --dir inputs2"""),
-           file: Optional[List[str]] = typer.Option(None,
-                                                    help="""File paths to use (Optional; overrides dir).
-                                                        E.g. --file inputs/1.md --file inputs/2.md"""),
-           recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."),
-           limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
-           formats: Optional[List[str]] = typer.Option([".rst", ".md"],
-                                                       help="""List of required extensions (list with .)
-                                                        Currently supported: 
-                                                        .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
-           exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
-           sample: Optional[bool] = typer.Option(False,
-                                                 help="Whether to output sample of the first 5 split documents."),
-           token_check: Optional[bool] = typer.Option(True, help="Whether to group small documents and split large."),
-           min_tokens: Optional[int] = typer.Option(150, help="Minimum number of tokens to not group."),
-           max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split."),
-           ):
-    """
-        Creates index from specified location or files.
-        By default /inputs folder is used, .rst and .md are parsed.
-    """
-
-    def process_one_docs(directory, folder_name):
-        raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
-                                         required_exts=formats, num_files_limit=limit,
-                                         exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
-
-        # Here we split the documents, as needed, into smaller chunks.
-        # We do this due to the context limits of the LLMs.
-        raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens,
-                               token_check=token_check)
-        # Old method
-        # text_splitter = RecursiveCharacterTextSplitter()
-        # docs = text_splitter.split_documents(raw_docs)
-
-        # Sample feature
-        if sample:
-            for i in range(min(5, len(raw_docs))):
-                print(raw_docs[i].text)
-
-        docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-
-        # Here we check for command line arguments for bot calls.
-        # If no argument exists or the yes is not True, then the
-        # user permission is requested to call the API.
-        if len(sys.argv) > 1 and yes:
-            call_openai_api(docs, folder_name)
-        else:
-            get_user_permission(docs, folder_name)
-
-
-    folder_counts = defaultdict(int)
-    folder_names = []
-    for dir_path in dir:
-        folder_name = os.path.basename(os.path.normpath(dir_path))
-        folder_counts[folder_name] += 1
-        if folder_counts[folder_name] > 1:
-            folder_name = f"{folder_name}_{folder_counts[folder_name]}"
-        folder_names.append(folder_name)
-
-    for directory, folder_name in zip(dir, folder_names):
-        process_one_docs(directory, folder_name)
-
-
-@app.command()
-def convert(dir: Optional[str] = typer.Option("inputs",
-                                              help="""Path to directory to make documentation for.
-                                                        E.g. --dir inputs """),
-            formats: Optional[str] = typer.Option("py",
-                                                  help="""Required language. 
-                                                        py, js, java supported for now""")):
-    """
-            Creates documentation linked to original functions from specified location.
-            By default /inputs folder is used, .py is parsed.
-    """
-    # Using a dictionary to map between the formats and their respective extraction functions
-    # makes the code more scalable. When adding more formats in the future, 
-    # you only need to update the extraction_functions dictionary.
-    extraction_functions = {
-    'py': extract_py,
-    'js': extract_js,
-    'java': extract_java
-    }
-
-    if formats in extraction_functions:
-        functions_dict, classes_dict = extraction_functions[formats](dir)
-    else:
-        raise Exception("Sorry, language not supported yet")                                   
-    transform_to_docs(functions_dict, classes_dict, formats, dir)
-
-
-if __name__ == "__main__":
-    app()
--- a/scripts/old/init.py
+++ b/scripts/old/init.py
--- a/scripts/old/ingest_rst.py
+++ b/scripts/old/ingest_rst.py
@@ -1,90 +0,0 @@
-import pickle
-import sys
-from argparse import ArgumentParser
-from pathlib import Path
-
-import dotenv
-import faiss
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens / 1000) * 0.0004)
-    return num_tokens, total_price
-
-
-def call_openai_api():
-    # Function to create a vector store from the documents and save it to disk.
-    store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
-    faiss.write_index(store.index, "docs.index")
-    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
-        pickle.dump(store, f)
-
-
-def get_user_permission():
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    docs_content = (" ".join(docs))
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api()
-    elif user_input == "":
-        call_openai_api()
-    else:
-        print("The API was not called. No money was spent.")
-
-
-# Load .env file
-dotenv.load_dotenv()
-
-ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")
-ap.add_argument("-i", "--inputs",
-                type=str,
-                default="inputs",
-                help="Directory containing documentation files")
-args = ap.parse_args()
-
-# Here we load in the data in the format that Notion exports it in.
-ps = list(Path(args.inputs).glob("**/*.rst"))
-
-# parse all child directories
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
-docs = []
-metadatas = []
-for i, d in enumerate(data):
-    splits = text_splitter.split_text(d)
-    docs.extend(splits)
-    metadatas.extend([{"source": sources[i]}] * len(splits))
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api()
-    else:
-        get_user_permission()
-else:
-    get_user_permission()
--- a/scripts/old/ingest_rst_sphinx.py
+++ b/scripts/old/ingest_rst_sphinx.py
@@ -1,133 +0,0 @@
-import os
-import pickle
-import shutil
-import sys
-from argparse import ArgumentParser
-from pathlib import Path
-
-import dotenv
-import faiss
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-from sphinx.cmd.build import main as sphinx_main
-
-
-def convert_rst_to_txt(src_dir, dst_dir):
-    # Check if the source directory exists
-    if not os.path.exists(src_dir):
-        raise Exception("Source directory does not exist")
-    # Walk through the source directory
-    for root, dirs, files in os.walk(src_dir):
-        for file in files:
-            # Check if the file has .rst extension
-            if file.endswith(".rst"):
-                # Construct the full path of the file
-                src_file = os.path.join(root, file.replace(".rst", ""))
-                # Convert the .rst file to .txt file using sphinx-build
-                args = f". -b text -D extensions=sphinx.ext.autodoc " \
-                       f"-D master_doc={src_file} " \
-                       f"-D source_suffix=.rst " \
-                       f"-C {dst_dir} "
-                sphinx_main(args.split())
-            elif file.endswith(".md"):
-                # Rename the .md file to .rst file
-                src_file = os.path.join(root, file)
-                dst_file = os.path.join(root, file.replace(".md", ".rst"))
-                os.rename(src_file, dst_file)
-                # Convert the .rst file to .txt file using sphinx-build
-                args = f". -b text -D extensions=sphinx.ext.autodoc " \
-                       f"-D master_doc={dst_file} " \
-                       f"-D source_suffix=.rst " \
-                       f"-C {dst_dir} "
-                sphinx_main(args.split())
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens / 1000) * 0.0004)
-    return num_tokens, total_price
-
-
-def call_openai_api():
-    # Function to create a vector store from the documents and save it to disk.
-    store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
-    faiss.write_index(store.index, "docs.index")
-    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
-        pickle.dump(store, f)
-
-
-def get_user_permission():
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    docs_content = (" ".join(docs))
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api()
-    elif user_input == "":
-        call_openai_api()
-    else:
-        print("The API was not called. No money was spent.")
-
-
-ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
-ap.add_argument("-i", "--inputs",
-                type=str,
-                default="inputs",
-                help="Directory containing documentation files")
-args = ap.parse_args()
-
-# Load .env file
-dotenv.load_dotenv()
-
-# Directory to vector
-src_dir = args.inputs
-dst_dir = "tmp"
-
-convert_rst_to_txt(src_dir, dst_dir)
-
-# Here we load in the data in the format that Notion exports it in.
-ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))
-
-# parse all child directories
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
-docs = []
-metadatas = []
-for i, d in enumerate(data):
-    splits = text_splitter.split_text(d)
-    docs.extend(splits)
-    metadatas.extend([{"source": sources[i]}] * len(splits))
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api()
-    else:
-        get_user_permission()
-else:
-    get_user_permission()
-
-# Delete tmp folder
-# Commented out for now
-shutil.rmtree(dst_dir)
--- a/scripts/parser/init.py
+++ b/scripts/parser/init.py
@@ -1 +0,0 @@
-
--- a/scripts/parser/file/init.py
+++ b/scripts/parser/file/init.py
--- a/scripts/parser/file/base.py
+++ b/scripts/parser/file/base.py
@@ -1,19 +0,0 @@
-"""Base reader class."""
-from abc import abstractmethod
-from typing import Any, List
-
-from langchain.docstore.document import Document as LCDocument
-from parser.schema.base import Document
-
-
-class BaseReader:
-    """Utilities for loading data from a directory."""
-
-    @abstractmethod
-    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
-        """Load data from the input directory."""
-
-    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
-        """Load data in LangChain document format."""
-        docs = self.load_data(**load_kwargs)
-        return [d.to_langchain_format() for d in docs]
--- a/scripts/parser/file/base_parser.py
+++ b/scripts/parser/file/base_parser.py
@@ -1,38 +0,0 @@
-"""Base parser and config class."""
-
-from abc import abstractmethod
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-
-class BaseParser:
-    """Base class for all parsers."""
-
-    def __init__(self, parser_config: Optional[Dict] = None):
-        """Init params."""
-        self._parser_config = parser_config
-
-    def init_parser(self) -> None:
-        """Init parser and store it."""
-        parser_config = self._init_parser()
-        self._parser_config = parser_config
-
-    @property
-    def parser_config_set(self) -> bool:
-        """Check if parser config is set."""
-        return self._parser_config is not None
-
-    @property
-    def parser_config(self) -> Dict:
-        """Check if parser config is set."""
-        if self._parser_config is None:
-            raise ValueError("Parser config not set.")
-        return self._parser_config
-
-    @abstractmethod
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-
-    @abstractmethod
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file."""
--- a/scripts/parser/file/bulk.py
+++ b/scripts/parser/file/bulk.py
@@ -1,167 +0,0 @@
-"""Simple reader that reads files of different formats from a directory."""
-import logging
-from parser.file.base import BaseReader
-from parser.file.base_parser import BaseParser
-from parser.file.docs_parser import DocxParser, PDFParser
-from parser.file.epub_parser import EpubParser
-from parser.file.html_parser import HTMLParser
-from parser.file.markdown_parser import MarkdownParser
-from parser.file.rst_parser import RstParser
-from parser.file.tabular_parser import PandasCSVParser
-from parser.schema.base import Document
-from pathlib import Path
-from typing import Callable, Dict, List, Optional, Union
-
-DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
-    ".pdf": PDFParser(),
-    ".docx": DocxParser(),
-    ".csv": PandasCSVParser(),
-    ".epub": EpubParser(),
-    ".md": MarkdownParser(),
-    ".rst": RstParser(),
-    ".html": HTMLParser(),
-    ".mdx": MarkdownParser(),
-}
-
-
-class SimpleDirectoryReader(BaseReader):
-    """Simple directory reader.
-
-    Can read files into separate documents, or concatenates
-    files into one document text.
-
-    Args:
-        input_dir (str): Path to the directory.
-        input_files (List): List of file paths to read (Optional; overrides input_dir)
-        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
-        errors (str): how encoding and decoding errors are to be handled,
-              see https://docs.python.org/3/library/functions.html#open
-        recursive (bool): Whether to recursively search in subdirectories.
-            False by default.
-        required_exts (Optional[List[str]]): List of required extensions.
-            Default is None.
-        file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file
-            extension to a BaseParser class that specifies how to convert that file
-            to text. See DEFAULT_FILE_EXTRACTOR.
-        num_files_limit (Optional[int]): Maximum number of files to read.
-            Default is None.
-        file_metadata (Optional[Callable[str, Dict]]): A function that takes
-            in a filename and returns a Dict of metadata for the Document.
-            Default is None.
-    """
-
-    def __init__(
-            self,
-            input_dir: Optional[str] = None,
-            input_files: Optional[List] = None,
-            exclude_hidden: bool = True,
-            errors: str = "ignore",
-            recursive: bool = True,
-            required_exts: Optional[List[str]] = None,
-            file_extractor: Optional[Dict[str, BaseParser]] = None,
-            num_files_limit: Optional[int] = None,
-            file_metadata: Optional[Callable[[str], Dict]] = None,
-    ) -> None:
-        """Initialize with parameters."""
-        super().__init__()
-
-        if not input_dir and not input_files:
-            raise ValueError("Must provide either `input_dir` or `input_files`.")
-
-        self.errors = errors
-
-        self.recursive = recursive
-        self.exclude_hidden = exclude_hidden
-        self.required_exts = required_exts
-        self.num_files_limit = num_files_limit
-        print("input_files")
-        print(input_files)
-
-        if input_files:
-            self.input_files = []
-            for path in input_files:
-                input_file = Path(path)
-                self.input_files.append(input_file)
-        elif input_dir:
-            self.input_dir = Path(input_dir)
-            self.input_files = self._add_files(self.input_dir)
-
-        self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR
-        self.file_metadata = file_metadata
-
-    def _add_files(self, input_dir: Path) -> List[Path]:
-        """Add files."""
-        input_files = sorted(input_dir.iterdir())
-        new_input_files = []
-        dirs_to_explore = []
-        for input_file in input_files:
-            if input_file.is_dir():
-                if self.recursive:
-                    dirs_to_explore.append(input_file)
-            elif self.exclude_hidden and input_file.name.startswith("."):
-                continue
-            elif (
-                    self.required_exts is not None
-                    and input_file.suffix not in self.required_exts
-            ):
-                continue
-            else:
-                new_input_files.append(input_file)
-
-        for dir_to_explore in dirs_to_explore:
-            sub_input_files = self._add_files(dir_to_explore)
-            new_input_files.extend(sub_input_files)
-
-        if self.num_files_limit is not None and self.num_files_limit > 0:
-            new_input_files = new_input_files[0: self.num_files_limit]
-
-        # print total number of files added
-        logging.debug(
-            f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
-        )
-
-        return new_input_files
-
-    def load_data(self, concatenate: bool = False) -> List[Document]:
-        """Load data from the input directory.
-
-        Args:
-            concatenate (bool): whether to concatenate all files into one document.
-                If set to True, file metadata is ignored.
-                False by default.
-
-        Returns:
-            List[Document]: A list of documents.
-
-        """
-        data: Union[str, List[str]] = ""
-        data_list: List[str] = []
-        metadata_list = []
-        for input_file in self.input_files:
-            if input_file.suffix in self.file_extractor:
-                parser = self.file_extractor[input_file.suffix]
-                if not parser.parser_config_set:
-                    parser.init_parser()
-                data = parser.parse_file(input_file, errors=self.errors)
-            else:
-                # do standard read
-                with open(input_file, "r", errors=self.errors) as f:
-                    data = f.read()
-            if isinstance(data, List):
-                data_list.extend(data)
-                if self.file_metadata is not None:
-                    for _ in range(len(data)):
-                        metadata_list.append(self.file_metadata(str(input_file)))
-            else:
-                data_list.append(str(data))
-                if self.file_metadata is not None:
-                    metadata_list.append(self.file_metadata(str(input_file)))
-
-            
-
-        if concatenate:
-            return [Document("\n".join(data_list))]
-        elif self.file_metadata is not None:
-            return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
-        else:
-            return [Document(d) for d in data_list]
--- a/scripts/parser/file/docs_parser.py
+++ b/scripts/parser/file/docs_parser.py
@@ -1,59 +0,0 @@
-"""Docs parser.
-
-Contains parsers for docx, pdf files.
-
-"""
-from pathlib import Path
-from typing import Dict
-
-from parser.file.base_parser import BaseParser
-
-
-class PDFParser(BaseParser):
-    """PDF parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import PyPDF2
-        except ImportError:
-            raise ValueError("PyPDF2 is required to read PDF files.")
-        text_list = []
-        with open(file, "rb") as fp:
-            # Create a PDF object
-            pdf = PyPDF2.PdfReader(fp)
-
-            # Get the number of pages in the PDF document
-            num_pages = len(pdf.pages)
-
-            # Iterate over every page
-            for page in range(num_pages):
-                # Extract the text from the page
-                page_text = pdf.pages[page].extract_text()
-                text_list.append(page_text)
-        text = "\n".join(text_list)
-
-        return text
-
-
-class DocxParser(BaseParser):
-    """Docx parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import docx2txt
-        except ImportError:
-            raise ValueError("docx2txt is required to read Microsoft Word files.")
-
-        text = docx2txt.process(file)
-
-        return text
--- a/scripts/parser/file/epub_parser.py
+++ b/scripts/parser/file/epub_parser.py
@@ -1,43 +0,0 @@
-"""Epub parser.
-
-Contains parsers for epub files.
-"""
-
-from pathlib import Path
-from typing import Dict
-
-from parser.file.base_parser import BaseParser
-
-
-class EpubParser(BaseParser):
-    """Epub Parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import ebooklib
-            from ebooklib import epub
-        except ImportError:
-            raise ValueError("`EbookLib` is required to read Epub files.")
-        try:
-            import html2text
-        except ImportError:
-            raise ValueError("`html2text` is required to parse Epub files.")
-
-        text_list = []
-        book = epub.read_epub(file, options={"ignore_ncx": True})
-
-        # Iterate through all chapters.
-        for item in book.get_items():
-            # Chapters are typically located in epub documents items.
-            if item.get_type() == ebooklib.ITEM_DOCUMENT:
-                text_list.append(
-                    html2text.html2text(item.get_content().decode("utf-8"))
-                )
-
-        text = "\n".join(text_list)
-        return text
--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -1,83 +0,0 @@
-"""HTML parser.
-
-Contains parser for html files.
-
-"""
-import re
-from pathlib import Path
-from typing import Dict, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class HTMLParser(BaseParser):
-    """HTML parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
-        """Parse file.
-
-            Returns:
-            Union[str, List[str]]: a string or a List of strings.
-        """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #   {"text": "My Title", "type": "Title"},
-        #   {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-        # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings 
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be a user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string       
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
--- a/scripts/parser/file/markdown_parser.py
+++ b/scripts/parser/file/markdown_parser.py
@@ -1,149 +0,0 @@
-"""Markdown parser.
-
-Contains parser for md files.
-
-"""
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
-
-import tiktoken
-from parser.file.base_parser import BaseParser
-
-
-class MarkdownParser(BaseParser):
-    """Markdown parser.
-
-    Extract text from markdown files.
-    Returns dictionary with keys as headers and values as the text between headers.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            remove_hyperlinks: bool = True,
-            remove_images: bool = True,
-            max_tokens: int = 2048,
-            # remove_tables: bool = True,
-            **kwargs: Any,
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._remove_hyperlinks = remove_hyperlinks
-        self._remove_images = remove_images
-        self._max_tokens = max_tokens
-        # self._remove_tables = remove_tables
-
-    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
-                          current_text: str):
-        """Append to tups chunk."""
-        num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
-        if num_tokens > self._max_tokens:
-            chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
-            for chunk in chunks:
-                tups.append((current_header, chunk))
-        else:
-            tups.append((current_header, current_text))
-        return tups
-
-    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
-        """Convert a markdown file to a dictionary.
-
-        The keys are the headers and the values are the text under each header.
-
-        """
-        markdown_tups: List[Tuple[Optional[str], str]] = []
-        lines = markdown_text.split("\n")
-
-        current_header = None
-        current_text = ""
-
-        for line in lines:
-            header_match = re.match(r"^#+\s", line)
-            if header_match:
-                if current_header is not None:
-                    if current_text == "" or None:
-                        continue
-                    markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
-
-                current_header = line
-                current_text = ""
-            else:
-                current_text += line + "\n"
-        markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
-
-        if current_header is not None:
-            # pass linting, assert keys are defined
-            markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
-                for key, value in markdown_tups
-            ]
-        else:
-            markdown_tups = [
-                (key, re.sub("\n", "", value)) for key, value in markdown_tups
-            ]
-
-        return markdown_tups
-
-    def remove_images(self, content: str) -> str:
-        """Get a dictionary of a markdown file from its path."""
-        pattern = r"!{1}\[\[(.*)\]\]"
-        content = re.sub(pattern, "", content)
-        return content
-
-    # def remove_tables(self, content: str) -> List[List[str]]:
-    #     """Convert markdown tables to nested lists."""
-    #     table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)"
-    #     table_cells_pattern = r"([^\|\r\n]*)\|"
-    #
-    #     table_rows = re.findall(table_rows_pattern, content, re.MULTILINE)
-    #     table_lists = []
-    #     for row in table_rows:
-    #         cells = re.findall(table_cells_pattern, row[2])
-    #         cells = [cell.strip() for cell in cells if cell.strip()]
-    #         table_lists.append(cells)
-    #     return str(table_lists)
-
-    def remove_hyperlinks(self, content: str) -> str:
-        """Get a dictionary of a markdown file from its path."""
-        pattern = r"\[(.*?)\]\((.*?)\)"
-        content = re.sub(pattern, r"\1", content)
-        return content
-
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-        return {}
-
-    def parse_tups(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> List[Tuple[Optional[str], str]]:
-        """Parse file into tuples."""
-        with open(filepath, "r", encoding='utf8') as f:
-            try:
-                content = f.read()
-            except (Exception,) as e:
-                print(f'Error a file: "{filepath}"')
-                raise e
-        if self._remove_hyperlinks:
-            content = self.remove_hyperlinks(content)
-        if self._remove_images:
-            content = self.remove_images(content)
-        # if self._remove_tables:
-        #     content = self.remove_tables(content)
-        markdown_tups = self.markdown_to_tups(content)
-        return markdown_tups
-
-    def parse_file(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> Union[str, List[str]]:
-        """Parse file into string."""
-        tups = self.parse_tups(filepath, errors=errors)
-        results = []
-        # TODO: don't include headers right now
-        for header, value in tups:
-            if header is None:
-                results.append(value)
-            else:
-                results.append(f"\n\n{header}\n{value}")
-        return results
--- a/scripts/parser/file/openapi3_parser.py
+++ b/scripts/parser/file/openapi3_parser.py
@@ -1,51 +0,0 @@
-from urllib.parse import urlparse
-
-from openapi_parser import parse
-
-try:
-    from scripts.parser.file.base_parser import BaseParser
-except ModuleNotFoundError:
-    from base_parser import BaseParser
-
-
-class OpenAPI3Parser(BaseParser):
-    def init_parser(self) -> None:
-        return super().init_parser()
-
-    def get_base_urls(self, urls):
-        base_urls = []
-        for i in urls:
-            parsed_url = urlparse(i)
-            base_url = parsed_url.scheme + "://" + parsed_url.netloc
-            if base_url not in base_urls:
-                base_urls.append(base_url)
-        return base_urls
-
-    def get_info_from_paths(self, path):
-        info = ""
-        if path.operations:
-            for operation in path.operations:
-                info += (
-                    f"\n{operation.method.value}="
-                    f"{operation.responses[0].description}"
-                )
-        return info
-
-    def parse_file(self, file_path):
-        data = parse(file_path)
-        results = ""
-        base_urls = self.get_base_urls(link.url for link in data.servers)
-        base_urls = ",".join([base_url for base_url in base_urls])
-        results += f"Base URL:{base_urls}\n"
-        i = 1
-        for path in data.paths:
-            info = self.get_info_from_paths(path)
-            results += (
-                f"Path{i}: {path.url}\n"
-                f"description: {path.description}\n"
-                f"parameters: {path.parameters}\nmethods: {info}\n"
-            )
-            i += 1
-        with open("results.txt", "w") as f:
-            f.write(results)
-        return results
--- a/scripts/parser/file/rst_parser.py
+++ b/scripts/parser/file/rst_parser.py
@@ -1,173 +0,0 @@
-"""reStructuredText parser.
-
-Contains parser for md files.
-
-"""
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class RstParser(BaseParser):
-    """reStructuredText parser.
-
-    Extract text from .rst files.
-    Returns dictionary with keys as headers and values as the text between headers.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            remove_hyperlinks: bool = True,
-            remove_images: bool = True,
-            remove_table_excess: bool = True,
-            remove_interpreters: bool = True,
-            remove_directives: bool = True,
-            remove_whitespaces_excess: bool = True,
-            # Be careful with remove_characters_excess, might cause data loss
-            remove_characters_excess: bool = True,
-            **kwargs: Any,
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._remove_hyperlinks = remove_hyperlinks
-        self._remove_images = remove_images
-        self._remove_table_excess = remove_table_excess
-        self._remove_interpreters = remove_interpreters
-        self._remove_directives = remove_directives
-        self._remove_whitespaces_excess = remove_whitespaces_excess
-        self._remove_characters_excess = remove_characters_excess
-
-    def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
-        """Convert a reStructuredText file to a dictionary.
-
-        The keys are the headers and the values are the text under each header.
-
-        """
-        rst_tups: List[Tuple[Optional[str], str]] = []
-        lines = rst_text.split("\n")
-
-        current_header = None
-        current_text = ""
-
-        for i, line in enumerate(lines):
-            header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
-            if header_match and i > 0 and (
-                    len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
-                if current_header is not None:
-                    if current_text == "" or None:
-                        continue
-                    # removes the next heading from current Document
-                    if current_text.endswith(lines[i - 1] + "\n"):
-                        current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")]
-                    rst_tups.append((current_header, current_text))
-
-                current_header = lines[i - 1]
-                current_text = ""
-            else:
-                current_text += line + "\n"
-
-        rst_tups.append((current_header, current_text))
-
-        # TODO: Format for rst
-        #
-        # if current_header is not None:
-        #     # pass linting, assert keys are defined
-        #     rst_tups = [
-        #         (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
-        #         for key, value in rst_tups
-        #     ]
-        # else:
-        #     rst_tups = [
-        #         (key, re.sub("\n", "", value)) for key, value in rst_tups
-        #     ]
-
-        if current_header is None:
-            rst_tups = [
-                (key, re.sub("\n", "", value)) for key, value in rst_tups
-            ]
-        return rst_tups
-
-    def remove_images(self, content: str) -> str:
-        pattern = r"\.\. image:: (.*)"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_hyperlinks(self, content: str) -> str:
-        pattern = r"`(.*?) <(.*?)>`_"
-        content = re.sub(pattern, r"\1", content)
-        return content
-
-    def remove_directives(self, content: str) -> str:
-        """Removes reStructuredText Directives"""
-        pattern = r"`\.\.([^:]+)::"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_interpreters(self, content: str) -> str:
-        """Removes reStructuredText Interpreted Text Roles"""
-        pattern = r":(\w+):"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_table_excess(self, content: str) -> str:
-        """Pattern to remove grid table separators"""
-        pattern = r"^\+[-]+\+[-]+\+$"
-        content = re.sub(pattern, "", content, flags=re.MULTILINE)
-        return content
-
-    def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
-        """Pattern to match 2 or more consecutive whitespaces"""
-        pattern = r"\s{2,}"
-        content = [(key, re.sub(pattern, "  ", value)) for key, value in content]
-        return content
-
-    def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
-        """Pattern to match 2 or more consecutive characters"""
-        pattern = r"(\S)\1{2,}"
-        content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content]
-        return content
-
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-        return {}
-
-    def parse_tups(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> List[Tuple[Optional[str], str]]:
-        """Parse file into tuples."""
-        with open(filepath, "r") as f:
-            content = f.read()
-        if self._remove_hyperlinks:
-            content = self.remove_hyperlinks(content)
-        if self._remove_images:
-            content = self.remove_images(content)
-        if self._remove_table_excess:
-            content = self.remove_table_excess(content)
-        if self._remove_directives:
-            content = self.remove_directives(content)
-        if self._remove_interpreters:
-            content = self.remove_interpreters(content)
-        rst_tups = self.rst_to_tups(content)
-        if self._remove_whitespaces_excess:
-            rst_tups = self.remove_whitespaces_excess(rst_tups)
-        if self._remove_characters_excess:
-            rst_tups = self.remove_characters_excess(rst_tups)
-        return rst_tups
-
-    def parse_file(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> Union[str, List[str]]:
-        """Parse file into string."""
-        tups = self.parse_tups(filepath, errors=errors)
-        results = []
-        # TODO: don't include headers right now
-        for header, value in tups:
-            if header is None:
-                results.append(value)
-            else:
-                results.append(f"\n\n{header}\n{value}")
-        return results
--- a/scripts/parser/file/tabular_parser.py
+++ b/scripts/parser/file/tabular_parser.py
@@ -1,115 +0,0 @@
-"""Tabular parser.
-
-Contains parsers for tabular data files.
-
-"""
-from pathlib import Path
-from typing import Any, Dict, List, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class CSVParser(BaseParser):
-    """CSV parser.
-
-    Args:
-        concat_rows (bool): whether to concatenate all rows into one document.
-            If set to False, a Document will be created for each row.
-            True by default.
-
-    """
-
-    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._concat_rows = concat_rows
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file.
-
-        Returns:
-            Union[str, List[str]]: a string or a List of strings.
-
-        """
-        try:
-            import csv
-        except ImportError:
-            raise ValueError("csv module is required to read CSV files.")
-        text_list = []
-        with open(file, "r") as fp:
-            csv_reader = csv.reader(fp)
-            for row in csv_reader:
-                text_list.append(", ".join(row))
-        if self._concat_rows:
-            return "\n".join(text_list)
-        else:
-            return text_list
-
-
-class PandasCSVParser(BaseParser):
-    r"""Pandas-based CSV parser.
-
-    Parses CSVs using the separator detection from Pandas `read_csv`function.
-    If special parameters are required, use the `pandas_config` dict.
-
-    Args:
-        concat_rows (bool): whether to concatenate all rows into one document.
-            If set to False, a Document will be created for each row.
-            True by default.
-
-        col_joiner (str): Separator to use for joining cols per row.
-            Set to ", " by default.
-
-        row_joiner (str): Separator to use for joining each row.
-            Only used when `concat_rows=True`.
-            Set to "\n" by default.
-
-        pandas_config (dict): Options for the `pandas.read_csv` function call.
-            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
-            for more information.
-            Set to empty dict by default, this means pandas will try to figure
-            out the separators, table head, etc. on its own.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            concat_rows: bool = True,
-            col_joiner: str = ", ",
-            row_joiner: str = "\n",
-            pandas_config: dict = {},
-            **kwargs: Any
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._concat_rows = concat_rows
-        self._col_joiner = col_joiner
-        self._row_joiner = row_joiner
-        self._pandas_config = pandas_config
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file."""
-        try:
-            import pandas as pd
-        except ImportError:
-            raise ValueError("pandas module is required to read CSV files.")
-
-        df = pd.read_csv(file, **self._pandas_config)
-
-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
-
-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
--- a/scripts/parser/java2doc.py
+++ b/scripts/parser/java2doc.py
@@ -1,66 +0,0 @@
-import os
-
-import javalang
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.java'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, "r") as file:
-        java_code = file.read()
-        methods = {}
-        tree = javalang.parse.parse(java_code)
-        for _, node in tree.filter(javalang.tree.MethodDeclaration):
-            method_name = node.name
-            start_line = node.position.line - 1
-            end_line = start_line
-            brace_count = 0
-            for line in java_code.splitlines()[start_line:]:
-                end_line += 1
-                brace_count += line.count("{") - line.count("}")
-                if brace_count == 0:
-                    break
-            method_source_code = "\n".join(java_code.splitlines()[start_line:end_line])
-            methods[method_name] = method_source_code
-    return methods
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = javalang.parse.parse(source_code)
-        for class_decl in tree.types:
-            class_name = class_decl.name
-            declarations = []
-            methods = []
-            for field_decl in class_decl.fields:
-                field_name = field_decl.declarators[0].name
-                field_type = field_decl.type.name
-                declarations.append(f"{field_type} {field_name}")
-            for method_decl in class_decl.methods:
-                methods.append(method_decl.name)
-            class_string = "Declarations: " + ", ".join(declarations) + "\n  Method name: " + ", ".join(methods)
-            classes[class_name] = class_string
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
--- a/scripts/parser/js2doc.py
+++ b/scripts/parser/js2doc.py
@@ -1,70 +0,0 @@
-import os
-
-import escodegen
-import esprima
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.js'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        functions = {}
-        tree = esprima.parseScript(source_code)
-        for node in tree.body:
-            if node.type == 'FunctionDeclaration':
-                func_name = node.id.name if node.id else '<anonymous>'
-                functions[func_name] = escodegen.generate(node)
-            elif node.type == 'VariableDeclaration':
-                for declaration in node.declarations:
-                    if declaration.init and declaration.init.type == 'FunctionExpression':
-                        func_name = declaration.id.name if declaration.id else '<anonymous>'
-                        functions[func_name] = escodegen.generate(declaration.init)
-            elif node.type == 'ClassDeclaration':
-                for subnode in node.body.body:
-                    if subnode.type == 'MethodDefinition':
-                        func_name = subnode.key.name
-                        functions[func_name] = escodegen.generate(subnode.value)
-                    elif subnode.type == 'VariableDeclaration':
-                        for declaration in subnode.declarations:
-                            if declaration.init and declaration.init.type == 'FunctionExpression':
-                                func_name = declaration.id.name if declaration.id else '<anonymous>'
-                                functions[func_name] = escodegen.generate(declaration.init)
-        return functions
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = esprima.parseScript(source_code)
-        for node in tree.body:
-            if node.type == 'ClassDeclaration':
-                class_name = node.id.name
-                function_names = []
-                for subnode in node.body.body:
-                    if subnode.type == 'MethodDefinition':
-                        function_names.append(subnode.key.name)
-                classes[class_name] = ", ".join(function_names)
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -1,100 +0,0 @@
-import os
-
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import FAISS
-from retry import retry
-
-
-# from langchain.embeddings import HuggingFaceEmbeddings
-# from langchain.embeddings import HuggingFaceInstructEmbeddings
-# from langchain.embeddings import CohereEmbeddings
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> tuple[int, float]:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
-
-
-@retry(tries=10, delay=60)
-def store_add_texts_with_retry(store, i):
-    store.add_texts([i.page_content], metadatas=[i.metadata])
-    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])
-
-
-def call_openai_api(docs, folder_name):
-    # Function to create a vector store from the documents and save it to disk.
-
-    # create output folder if it doesn't exist
-    if not os.path.exists(f"outputs/{folder_name}"):
-        os.makedirs(f"outputs/{folder_name}")
-
-    from tqdm import tqdm
-
-    docs_test = [docs[0]]
-    # remove the first element from docs
-    docs.pop(0)
-    # cut first n docs if you want to restart
-    # docs = docs[:n]
-    c1 = 0
-    # pinecone.init(
-    #     api_key="",  # find at app.pinecone.io
-    #     environment="us-east1-gcp"  # next to api key in console
-    # )
-    # index_name = "pandas"
-    if (  # azure
-        os.environ.get("OPENAI_API_BASE")
-        and os.environ.get("OPENAI_API_VERSION")
-        and os.environ.get("AZURE_DEPLOYMENT_NAME")
-        and os.environ.get("AZURE_EMBEDDINGS_DEPLOYMENT_NAME")
-    ):
-        os.environ["OPENAI_API_TYPE"] = "azure"
-        openai_embeddings = OpenAIEmbeddings(model=os.environ.get("AZURE_EMBEDDINGS_DEPLOYMENT_NAME"))
-    else:
-        openai_embeddings = OpenAIEmbeddings()
-    store = FAISS.from_documents(docs_test, openai_embeddings)
-    # store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name)
-
-    # Uncomment for MPNet embeddings
-    # model_name = "sentence-transformers/all-mpnet-base-v2"
-    # hf = HuggingFaceEmbeddings(model_name=model_name)
-    # store = FAISS.from_documents(docs_test, hf)
-    for i in tqdm(
-        docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format="{l_bar}{bar}| Time Left: {remaining}"
-    ):
-        try:
-            store_add_texts_with_retry(store, i)
-        except Exception as e:
-            print(e)
-            print("Error on ", i)
-            print("Saving progress")
-            print(f"stopped at {c1} out of {len(docs)}")
-            store.save_local(f"outputs/{folder_name}")
-            break
-        c1 += 1
-    store.save_local(f"outputs/{folder_name}")
-
-
-def get_user_permission(docs, folder_name):
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api(docs, folder_name)
-    elif user_input == "":
-        call_openai_api(docs, folder_name)
-    else:
-        print("The API was not called. No money was spent.")
--- a/scripts/parser/py2doc.py
+++ b/scripts/parser/py2doc.py
@@ -1,121 +0,0 @@
-import ast
-import os
-from pathlib import Path
-
-import tiktoken
-from langchain_community.llms import OpenAI
-from langchain.prompts import PromptTemplate
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.py'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        functions = {}
-        tree = ast.parse(source_code)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.FunctionDef):
-                func_name = node.name
-                func_def = ast.get_source_segment(source_code, node)
-                functions[func_name] = func_def
-    return functions
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = ast.parse(source_code)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.ClassDef):
-                class_name = node.name
-                function_names = []
-                for subnode in ast.walk(node):
-                    if isinstance(subnode, ast.FunctionDef):
-                        function_names.append(subnode.name)
-                classes[class_name] = ", ".join(function_names)
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
-
-
-def parse_functions(functions_dict, formats, dir):
-    c1 = len(functions_dict)
-    for i, (source, functions) in enumerate(functions_dict.items(), start=1):
-        print(f"Processing file {i}/{c1}")
-        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
-        subfolders = "/".join(source_w.split("/")[:-1])
-        Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
-        for j, (name, function) in enumerate(functions.items(), start=1):
-            print(f"Processing function {j}/{len(functions)}")
-            prompt = PromptTemplate(
-                input_variables=["code"],
-                template="Code: \n{code}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(code=function))
-            mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
-            with open(f"outputs/{source_w}", mode) as f:
-                f.write(
-                    f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
-
-
-def parse_classes(classes_dict, formats, dir):
-    c1 = len(classes_dict)
-    for i, (source, classes) in enumerate(classes_dict.items()):
-        print(f"Processing file {i + 1}/{c1}")
-        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
-        subfolders = "/".join(source_w.split("/")[:-1])
-        Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
-        for name, function_names in classes.items():
-            print(f"Processing Class {i + 1}/{c1}")
-            prompt = PromptTemplate(
-                input_variables=["class_name", "functions_names"],
-                template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(class_name=name, functions_names=function_names))
-
-            with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
-                f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
-
-
-def transform_to_docs(functions_dict, classes_dict, formats, dir):
-    docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
-    docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
-
-    num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content))
-    total_price = ((num_tokens / 1000) * 0.02)
-
-    print(f"Number of Tokens = {num_tokens:,d}")
-    print(f"Approx Cost = ${total_price:,.2f}")
-
-    user_input = input("Price Okay? (Y/N)\n").lower()
-    if user_input == "y" or user_input == "":
-        if not Path("outputs").exists():
-            Path("outputs").mkdir()
-        parse_functions(functions_dict, formats, dir)
-        parse_classes(classes_dict, formats, dir)
-        print("All done!")
-    else:
-        print("The API was not called. No money was spent.")
--- a/scripts/parser/schema/init.py
+++ b/scripts/parser/schema/init.py
--- a/scripts/parser/schema/base.py
+++ b/scripts/parser/schema/base.py
@@ -1,34 +0,0 @@
-"""Base schema for readers."""
-from dataclasses import dataclass
-
-from langchain.docstore.document import Document as LCDocument
-from parser.schema.schema import BaseDocument
-
-
-@dataclass
-class Document(BaseDocument):
-    """Concrete data document that can be exchanged with LangChain.
-
-    An instance must carry text; construction fails otherwise.
-
-    """
-
-    def __post_init__(self) -> None:
-        """Validate the freshly built instance: ``text`` must not be None."""
-        text_missing = self.text is None
-        if text_missing:
-            raise ValueError("text field not set.")
-
-    @classmethod
-    def get_type(cls) -> str:
-        """Return the type tag of this document class."""
-        return "Document"
-
-    def to_langchain_format(self) -> LCDocument:
-        """Build a LangChain document from this one.
-
-        ``extra_info`` becomes the metadata; a missing or empty
-        ``extra_info`` yields an empty dict instead.
-        """
-        if self.extra_info:
-            metadata = self.extra_info
-        else:
-            metadata = {}
-        return LCDocument(page_content=self.text, metadata=metadata)
-
-    @classmethod
-    def from_langchain_format(cls, doc: LCDocument) -> "Document":
-        """Build a document of this class from a LangChain document."""
-        return cls(text=doc.page_content, extra_info=doc.metadata)
--- a/scripts/parser/schema/schema.py
+++ b/scripts/parser/schema/schema.py
@@ -1,64 +0,0 @@
-"""Base schema for data structures."""
-from abc import abstractmethod
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-
-from dataclasses_json import DataClassJsonMixin
-
-
-@dataclass
-class BaseDocument(DataClassJsonMixin):
-    """Common base for documents and index structs.
-
-    Holds optional text, id, embedding and extra info, together with
-    accessors that raise when the requested field has not been set.
-
-    """
-
-    # TODO: consolidate fields from Document/IndexStruct into base class
-    text: Optional[str] = None
-    doc_id: Optional[str] = None
-    embedding: Optional[List[float]] = None
-
-    # extra fields
-    extra_info: Optional[Dict[str, Any]] = None
-
-    @classmethod
-    @abstractmethod
-    def get_type(cls) -> str:
-        """Return the type tag of the document class."""
-
-    def get_text(self) -> str:
-        """Return ``text``; raise ValueError when it is None."""
-        if self.text is not None:
-            return self.text
-        raise ValueError("text field not set.")
-
-    def get_doc_id(self) -> str:
-        """Return ``doc_id``; raise ValueError when it is None."""
-        if self.doc_id is not None:
-            return self.doc_id
-        raise ValueError("doc_id not set.")
-
-    @property
-    def is_doc_id_none(self) -> bool:
-        """Tell whether ``doc_id`` is still None."""
-        return self.doc_id is None
-
-    def get_embedding(self) -> List[float]:
-        """Return ``embedding``; raise ValueError when it is None."""
-        if self.embedding is not None:
-            return self.embedding
-        raise ValueError("embedding not set.")
-
-    @property
-    def extra_info_str(self) -> Optional[str]:
-        """Render ``extra_info`` as one ``key: value`` line per entry.
-
-        Returns None when ``extra_info`` is None.
-        """
-        info = self.extra_info
-        if info is None:
-            return None
-
-        lines = []
-        for key, value in info.items():
-            lines.append(f"{key}: {str(value)}")
-        return "\n".join(lines)
--- a/scripts/parser/token_func.py
+++ b/scripts/parser/token_func.py
@@ -1,76 +0,0 @@
-import re
-from math import ceil
-from typing import List
-
-import tiktoken
-from parser.schema.base import Document
-
-def separate_header_and_body(text):
-    """Split ``text`` into a header made of its first three lines and the remaining body.
-
-    Args:
-        text: The string to split.
-
-    Returns:
-        A ``(header, body)`` tuple. ``header`` is the first three
-        newline-terminated lines of ``text`` and ``body`` is everything after
-        them. When ``text`` has fewer than three newline-terminated lines
-        there is no header: ``header`` is ``""`` and ``body`` is ``text``.
-    """
-    header_pattern = r"^(.*?\n){3}"
-    match = re.match(header_pattern, text)
-    if match is None:
-        # Short inputs do not match; without this guard ``match.group``
-        # raised AttributeError on None.
-        return "", text
-    header = match.group(0)
-    return header, text[len(header):]
-
-
-def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
-    """Merge runs of small consecutive documents into larger ones.
-
-    The first document starts a group (a new ``Document`` copying its text,
-    doc_id, embedding and extra_info). Each following document is appended to
-    the current group, separated by a single space, when the group's token
-    count plus the document's token count is below ``max_tokens`` and the
-    document itself is below ``min_tokens``. Otherwise the current group is
-    emitted and the document starts a new group. A group keeps the doc_id,
-    embedding and extra_info of its first document.
-
-    Args:
-        documents: Documents to group, in order.
-        min_tokens: A document is only merged when it has fewer tokens than this.
-        max_tokens: A merge only happens while the combined token count stays
-            below this.
-
-    Returns:
-        The grouped documents; the input documents are not modified.
-    """
-    if not documents:
-        # Nothing to group; avoid loading the encoder at all.
-        return []
-
-    # The encoder does not change between iterations, so look it up once.
-    encoding = tiktoken.get_encoding("cl100k_base")
-
-    docs = []
-    current_group = None
-
-    for doc in documents:
-        doc_len = len(encoding.encode(doc.text))
-
-        if current_group is None:
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-        elif len(encoding.encode(current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
-            current_group.text += " " + doc.text
-        else:
-            docs.append(current_group)
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-
-    if current_group is not None:
-        docs.append(current_group)
-
-    return docs
-
-
-def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
-    """Split every document whose token count exceeds ``max_tokens`` into parts.
-
-    Documents at or below the limit are passed through as-is. A larger
-    document is separated into header and body via
-    ``separate_header_and_body``; if the header alone exceeds ``max_tokens``
-    the whole text is treated as body with no header. The body is cut into
-    ``ceil(token_length / max_tokens)`` slices of equal character length, and
-    each slice (stripped) is emitted as a new ``Document`` prefixed with the
-    header, with doc_id ``"<original doc_id>-<part index>"`` and the
-    original's embedding and extra_info.
-
-    Note that the slices are measured in characters, not tokens, so a part is
-    not guaranteed to fit ``max_tokens`` exactly.
-
-    Args:
-        documents: Documents to check and split, in order.
-        max_tokens: Token limit above which a document is split.
-
-    Returns:
-        The resulting list of documents, in input order.
-    """
-    if not documents:
-        # Nothing to split; avoid loading the encoder at all.
-        return []
-
-    # The encoder does not change between iterations, so look it up once.
-    encoding = tiktoken.get_encoding("cl100k_base")
-
-    docs = []
-    for doc in documents:
-        token_length = len(encoding.encode(doc.text))
-        if token_length <= max_tokens:
-            docs.append(doc)
-            continue
-
-        header, body = separate_header_and_body(doc.text)
-        if len(encoding.encode(header)) > max_tokens:
-            # A header that is itself too large cannot be repeated on every
-            # part, so fall back to splitting the full text.
-            body = doc.text
-            header = ""
-        num_body_parts = ceil(token_length / max_tokens)
-        part_length = ceil(len(body) / num_body_parts)
-        for i, start in enumerate(range(0, len(body), part_length)):
-            body_part = body[start:start + part_length]
-            docs.append(Document(text=header + body_part.strip(),
-                                 doc_id=f"{doc.doc_id}-{i}",
-                                 embedding=doc.embedding,
-                                 extra_info=doc.extra_info))
-    return docs
-
-
-def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    """Normalize document sizes by grouping small documents and splitting large ones.
-
-    Both stages are best-effort: if a stage raises, a message is printed and
-    processing continues with the documents as they were before that stage.
-
-    Args:
-        documents: Documents to process.
-        max_tokens: Upper token bound passed to both stages.
-        min_tokens: Lower token bound passed to the grouping stage.
-        token_check: When False, ``documents`` is returned untouched.
-
-    Returns:
-        The processed list of documents.
-    """
-    if not token_check:
-        return documents
-    print("Grouping small documents")
-    try:
-        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except Exception:
-        print("Grouping failed, try running without token_check")
-    print("Separating large documents")
-    try:
-        documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except Exception:
-        # This stage splits; the message used to misreport a grouping failure.
-        print("Splitting failed, try running without token_check")
-    return documents