Merge branch 'arc53:main' into main

2026-03-06 22:03:39 +00:00 · 2025-02-06 04:09:08 +05:30
parent 1f1e710a6d 0913c43219
commit 0bddae5775
54 changed files with 115 additions and 1932 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.12-bookworm
+
+# Fail a RUN step when any command in a pipeline fails. With the default
+# /bin/sh, a failed curl feeding `bash -` would exit 0 and the build would
+# silently continue without the NodeSource repository configured.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install Node.js 20.x from the NodeSource apt repository
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install global npm packages
+RUN npm install -g husky vite
+
+# Create a Python virtual environment and put it first on PATH so that
+# `python` and `pip` resolve to the venv in later steps and at runtime
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+WORKDIR /workspace
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,18 @@
+{
+	"name": "DocsGPT Dev Container",
+	"dockerComposeFile": ["../docker-compose-dev.yaml", "docker-compose.override.yaml"],
+	"service": "dev",
+	"workspaceFolder": "/workspace",
+	"postCreateCommand": ".devcontainer/post-create-command.sh",
+	"forwardPorts": [7091, 5173, 6379, 27017],
+	"customizations": {
+	  "vscode": {
+		"extensions": [
+		  "ms-python.python",
+		  "ms-toolsai.jupyter",
+		  "esbenp.prettier-vscode",
+		  "dbaeumer.vscode-eslint"
+		]
+	  }
+	}
+  }
--- a/.devcontainer/docker-compose.override.yaml
+++ b/.devcontainer/docker-compose.override.yaml
@@ -0,0 +1,40 @@
+# Compose override used by the dev container: adds the `dev` workspace
+# service and health checks for the `redis` and `mongo` services.
+# NOTE(review): `redis` and `mongo` are only extended here; they are
+# presumably defined in the base compose file listed first in
+# devcontainer.json — confirm that path is still valid.
+version: '3.8'
+
+services:
+  dev:
+    build:
+      # NOTE(review): these relative paths are presumably resolved against the
+      # directory of the first compose file passed to Compose — confirm they
+      # still point at the repository root and .devcontainer/Dockerfile.
+      context: .
+      dockerfile: .devcontainer/Dockerfile
+    volumes:
+      # Mount the project into the container's working directory
+      - .:/workspace:cached
+    # Keep the container running so an editor can attach to it
+    command: sleep infinity
+    # Start only after both backing services report healthy
+    depends_on:
+      redis:
+        condition: service_healthy
+      mongo:
+        condition: service_healthy
+    environment:
+      # Connection strings use the compose service names as hostnames
+      - CELERY_BROKER_URL=redis://redis:6379/0
+      - CELERY_RESULT_BACKEND=redis://redis:6379/1
+      - MONGO_URI=mongodb://mongo:27017/docsgpt
+      - CACHE_REDIS_URL=redis://redis:6379/2
+    networks:
+      - default
+
+  redis:
+    # Health check required by the `service_healthy` condition above
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 30s
+      retries: 5
+
+  mongo:
+    # Health check required by the `service_healthy` condition above
+    healthcheck:
+      test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"]
+      interval: 5s
+      timeout: 30s
+      retries: 5
+
+networks:
+  default:
+    # Fixed network name instead of the compose-generated default
+    name: docsgpt-dev-network
--- a/.devcontainer/post-create-command.sh
+++ b/.devcontainer/post-create-command.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -e  # Exit immediately if a command exits with a non-zero status
+
+cp -n .env-template .env || true
+mkdir -p model
+if [ ! -d model/all-mpnet-base-v2 ]; then
+    wget -q https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip -O model/mpnet-base-v2.zip
+    unzip -q model/mpnet-base-v2.zip -d model
+    rm model/mpnet-base-v2.zip
+fi
+pip install -r application/requirements.txt
+cd frontend
+npm install --include=dev
--- a/README.md
+++ b/README.md
@@ -102,8 +102,8 @@ On windows:
 3. Run the following command:

   ```bash
-   docker compose up --build
-   ```
+  docker compose -f deployment/docker-compose.yaml up --build
+  ```
 4. Navigate to http://localhost:5173/.

 To stop, just run `Ctrl + C`.
--- a/application/Dockerfile
+++ b/application/Dockerfile
@@ -20,7 +20,7 @@ RUN if [ -f /usr/bin/python3.12 ]; then \

 # Download and unzip the model
 RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip && \
-    unzip mpnet-base-v2.zip -d model && \
+    unzip mpnet-base-v2.zip -d models && \
    rm mpnet-base-v2.zip

 # Install Rust
@@ -63,7 +63,8 @@ RUN groupadd -r appuser && \

 # Copy the virtual environment and model from the builder stage
 COPY --from=builder /venv /venv
-COPY --from=builder /model /app/model
+
+COPY --from=builder /models /app/models

 # Copy your application code
 COPY . /app/application
--- a/application/vectorstore/base.py
+++ b/application/vectorstore/base.py
@@ -75,9 +75,9 @@ class BaseVectorStore(ABC):
                    openai_api_key=embeddings_key
                )
        elif embeddings_name == "huggingface_sentence-transformers/all-mpnet-base-v2":
-            if os.path.exists("./model/all-mpnet-base-v2"):
+            if os.path.exists("./models/all-mpnet-base-v2"):
                embedding_instance = EmbeddingsSingleton.get_instance(
-                    embeddings_name="./model/all-mpnet-base-v2",
+                    embeddings_name = "./models/all-mpnet-base-v2",
                )
            else:
                embedding_instance = EmbeddingsSingleton.get_instance(
@@ -86,4 +86,5 @@ class BaseVectorStore(ABC):
        else:
            embedding_instance = EmbeddingsSingleton.get_instance(embeddings_name)

-        return embedding_instance
+        return embedding_instance
+    
--- a/deployment/docker-compose-azure.yaml
+++ b/deployment/docker-compose-azure.yaml
--- a/deployment/docker-compose-dev.yaml
+++ b/deployment/docker-compose-dev.yaml
--- a/deployment/docker-compose-local.yaml
+++ b/deployment/docker-compose-local.yaml
--- a/deployment/docker-compose.yaml
+++ b/deployment/docker-compose.yaml
--- a/deployment/k8s/deployments/docsgpt-deploy.yaml
+++ b/deployment/k8s/deployments/docsgpt-deploy.yaml
--- a/deployment/k8s/deployments/mongo-deploy.yaml
+++ b/deployment/k8s/deployments/mongo-deploy.yaml
--- a/deployment/k8s/deployments/qdrant-deploy.yaml
+++ b/deployment/k8s/deployments/qdrant-deploy.yaml
--- a/deployment/k8s/deployments/redis-deploy.yaml
+++ b/deployment/k8s/deployments/redis-deploy.yaml
--- a/deployment/k8s/docsgpt-secrets.yaml
+++ b/deployment/k8s/docsgpt-secrets.yaml
--- a/deployment/k8s/services/docsgpt-service.yaml
+++ b/deployment/k8s/services/docsgpt-service.yaml
--- a/deployment/k8s/services/mongo-service.yaml
+++ b/deployment/k8s/services/mongo-service.yaml
--- a/deployment/k8s/services/qdrant-service.yaml
+++ b/deployment/k8s/services/qdrant-service.yaml
--- a/deployment/k8s/services/redis-service.yaml
+++ b/deployment/k8s/services/redis-service.yaml
--- a/docker-compose-mock.yaml
+++ b/docker-compose-mock.yaml
@@ -1,20 +0,0 @@
-services:
-  frontend:
-    build: ./frontend
-    environment:
-      - VITE_API_HOST=http://localhost:7091
-      - VITE_API_STREAMING=$VITE_API_STREAMING
-    ports:
-      - "5173:5173"
-    depends_on:
-      - mock-backend
-
-  mock-backend:
-    build: ./mock-backend
-    ports:
-      - "7091:7091"
-
-  redis:
-    image: redis:6-alpine
-    ports:
-      - 6379:6379
--- a/docs/pages/Deploying/Development-Environment.md
+++ b/docs/pages/Deploying/Development-Environment.md
@@ -2,14 +2,14 @@

 ### Spin up Mongo and Redis

-For development, only two containers are used from [docker-compose.yaml](https://github.com/arc53/DocsGPT/blob/main/docker-compose.yaml) (by deleting all services except for Redis and Mongo).
-See file [docker-compose-dev.yaml](https://github.com/arc53/DocsGPT/blob/main/docker-compose-dev.yaml).
+For development, only two containers are used from [docker-compose.yaml](https://github.com/arc53/DocsGPT/blob/main/deployment/docker-compose.yaml) (by deleting all services except for Redis and Mongo).
+See file [docker-compose-dev.yaml](https://github.com/arc53/DocsGPT/blob/main/deployment/docker-compose-dev.yaml).

 Run

 ```
-docker compose -f docker-compose-dev.yaml build
-docker compose -f docker-compose-dev.yaml up -d
+docker compose -f deployment/docker-compose-dev.yaml build
+docker compose -f deployment/docker-compose-dev.yaml up -d
 ```

 ### Run the Backend
--- a/docs/pages/Deploying/Hosting-the-app.md
+++ b/docs/pages/Deploying/Hosting-the-app.md
@@ -73,7 +73,7 @@ To save the file, press CTRL+X, then Y, and then ENTER.

 Next, set the correct IP for the Backend by opening the docker-compose.yml file:

-`nano docker-compose.yml`
+`nano deployment/docker-compose.yaml`

 And Change line 7 to: `VITE_API_HOST=http://localhost:7091`
 to this `VITE_API_HOST=http://<your instance public IP>:7091`
@@ -84,7 +84,7 @@ This will allow the frontend to connect to the backend.

 You're almost there! Now that all the necessary bits and pieces have been installed, it is time to run the application. To do so, use the following command:

-`sudo docker-compose up -d`
+`sudo docker compose -f deployment/docker-compose.yaml up -d`

 Launching it for the first time will take a few minutes to download all the necessary dependencies and build.

--- a/docs/pages/Deploying/Kubernetes-Deploying.md
+++ b/docs/pages/Deploying/Kubernetes-Deploying.md
@@ -11,7 +11,7 @@ Ensure you have the following installed before proceeding:

 ## Folder Structure

-The `k8s` folder contains the necessary deployment and service configuration files:
+The `deployment/k8s` folder contains the necessary deployment and service configuration files:

 - `deployments/`
 - `services/`
@@ -23,7 +23,7 @@ The `k8s` folder contains the necessary deployment and service configuration fil

   ```sh
   git clone https://github.com/arc53/DocsGPT.git
-   cd docsgpt/k8s
+   cd DocsGPT/deployment/k8s
   ```

 2. **Configure Secrets (optional)**
--- a/docs/pages/Deploying/Quickstart.md
+++ b/docs/pages/Deploying/Quickstart.md
@@ -29,7 +29,7 @@ If you prefer to follow manual steps, refer to this guide:
   
 3. Run the following commands:
   ```bash
-   docker compose up
+   docker compose -f deployment/docker-compose.yaml up
   ```
 4. Navigate to http://localhost:5173/.

@@ -56,7 +56,7 @@ To stop, simply press **Ctrl + C**.
 3. Run the following command:

   ```bash
-   docker-compose up
+   docker compose -f deployment/docker-compose.yaml up
   ```
 4. Navigate to http://localhost:5173/.
 5. To stop the setup, just press **Ctrl + C** in the WSL terminal
--- a/docs/pages/Deploying/Railway-Deploying.md
+++ b/docs/pages/Deploying/Railway-Deploying.md
@@ -97,11 +97,11 @@ To save the file, press CTRL+X, then Y, and then ENTER.

  

-Next, set the correct IP for the Backend by opening the docker-compose.yml file:
+Next, set the correct IP for the Backend by opening the docker-compose.yaml file:

  

-`nano docker-compose.yml`
+`nano deployment/docker-compose.yaml`

  

@@ -123,7 +123,7 @@ You're almost there! Now that all the necessary bits and pieces have been instal

  

-`sudo docker-compose up -d`
+`sudo docker compose -f deployment/docker-compose.yaml up -d`

  

--- a/run-with-docker-compose.sh
+++ b/run-with-docker-compose.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-source .env
-
-if [[ -n "$OPENAI_API_BASE" ]] && [[ -n "$OPENAI_API_VERSION" ]] && [[ -n "$AZURE_DEPLOYMENT_NAME" ]] && [[ -n "$AZURE_EMBEDDINGS_DEPLOYMENT_NAME" ]]; then
-  echo "Running Azure Configuration"
-  docker compose -f docker-compose-azure.yaml up --build
-else
-  echo "Running Plain Configuration"
-  docker compose up --build
-fi
--- a/scripts/init.py
+++ b/scripts/init.py
--- a/scripts/code_docs_gen.py
+++ b/scripts/code_docs_gen.py
@@ -1,95 +0,0 @@
-import ast
-import json
-from pathlib import Path
-
-import dotenv
-from langchain_community.llms import OpenAI
-from langchain.prompts import PromptTemplate
-
-dotenv.load_dotenv()
-
-ps = list(Path("inputs").glob("**/*.py"))
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-
-def get_functions_in_class(node):
-    functions = []
-    functions_code = []
-    for child in node.body:
-        if isinstance(child, ast.FunctionDef):
-            functions.append(child.name)
-            functions_code.append(ast.unparse(child))
-
-    return functions, functions_code
-
-
-def get_classes_and_functions(source_code):
-    tree = ast.parse(source_code)
-    classes = {}
-    for node in tree.body:
-        if isinstance(node, ast.ClassDef):
-            class_name = node.name
-            function_name, function = get_functions_in_class(node)
-            # join function name and function code
-            functions = dict(zip(function_name, function))
-            classes[class_name] = functions
-    return classes
-
-
-structure_dict = {}
-c1 = 0
-for code in data:
-    classes = get_classes_and_functions(ast.parse(code))
-    source = str(sources[c1])
-    structure_dict[source] = classes
-    c1 += 1
-
-# save the structure dict as json
-with open('structure_dict.json', 'w') as f:
-    json.dump(structure_dict, f)
-
-if not Path("outputs").exists():
-    Path("outputs").mkdir()
-
-c1 = len(structure_dict)
-c2 = 0
-for source, classes in structure_dict.items():
-    c2 += 1
-    print(f"Processing file {c2}/{c1}")
-    f1 = len(classes)
-    f2 = 0
-    for class_name, functions in classes.items():
-        f2 += 1
-        print(f"Processing class {f2}/{f1}")
-        source_w = source.replace("inputs/", "")
-        source_w = source_w.replace(".py", ".txt")
-        if not Path(f"outputs/{source_w}").exists():
-            with open(f"outputs/{source_w}", "w") as f:
-                f.write(f"Class: {class_name}")
-        else:
-            with open(f"outputs/{source_w}", "a") as f:
-                f.write(f"\n\nClass: {class_name}")
-        # append class name to the front
-        for function in functions:
-            b1 = len(functions)
-            b2 = 0
-            print(f"Processing function {b2}/{b1}")
-            b2 += 1
-            prompt = PromptTemplate(
-                input_variables=["code"],
-                template="Code: \n{code}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(code=functions[function]))
-
-            if not Path(f"outputs/{source_w}").exists():
-                with open(f"outputs/{source_w}", "w") as f:
-                    f.write(f"Function: {functions[function]}, \nDocumentation: {response}")
-            else:
-                with open(f"outputs/{source_w}", "a") as f:
-                    f.write(f"\n\nFunction: {functions[function]}, \nDocumentation: {response}")
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,128 +0,0 @@
-import os
-import sys
-from collections import defaultdict
-from typing import List, Optional
-
-import dotenv
-import nltk
-import typer
-
-from parser.file.bulk import SimpleDirectoryReader
-from parser.java2doc import extract_functions_and_classes as extract_java
-from parser.js2doc import extract_functions_and_classes as extract_js
-from parser.open_ai_func import call_openai_api, get_user_permission
-from parser.py2doc import extract_functions_and_classes as extract_py
-from parser.py2doc import transform_to_docs
-from parser.schema.base import Document
-from parser.token_func import group_split
-
-dotenv.load_dotenv()
-
-app = typer.Typer(add_completion=False)
-
-nltk.download('punkt', quiet=True)
-nltk.download('averaged_perceptron_tagger', quiet=True)
-
-
-def metadata_from_filename(title):
-    return {'title': title}
-
-# Splits all files in specified folder to documents
-@app.command()
-def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
-                                    help="Whether to skip price confirmation"),
-           dir: Optional[List[str]] = typer.Option(["inputs"],
-                                                   help="""List of paths to directory for index creation.
-                                                        E.g. --dir inputs --dir inputs2"""),
-           file: Optional[List[str]] = typer.Option(None,
-                                                    help="""File paths to use (Optional; overrides dir).
-                                                        E.g. --file inputs/1.md --file inputs/2.md"""),
-           recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."),
-           limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
-           formats: Optional[List[str]] = typer.Option([".rst", ".md"],
-                                                       help="""List of required extensions (list with .)
-                                                        Currently supported: 
-                                                        .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
-           exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
-           sample: Optional[bool] = typer.Option(False,
-                                                 help="Whether to output sample of the first 5 split documents."),
-           token_check: Optional[bool] = typer.Option(True, help="Whether to group small documents and split large."),
-           min_tokens: Optional[int] = typer.Option(150, help="Minimum number of tokens to not group."),
-           max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split."),
-           ):
-    """
-        Creates index from specified location or files.
-        By default /inputs folder is used, .rst and .md are parsed.
-    """
-
-    def process_one_docs(directory, folder_name):
-        raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
-                                         required_exts=formats, num_files_limit=limit,
-                                         exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
-
-        # Here we split the documents, as needed, into smaller chunks.
-        # We do this due to the context limits of the LLMs.
-        raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens,
-                               token_check=token_check)
-        # Old method
-        # text_splitter = RecursiveCharacterTextSplitter()
-        # docs = text_splitter.split_documents(raw_docs)
-
-        # Sample feature
-        if sample:
-            for i in range(min(5, len(raw_docs))):
-                print(raw_docs[i].text)
-
-        docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-
-        # Here we check for command line arguments for bot calls.
-        # If no argument exists or the yes is not True, then the
-        # user permission is requested to call the API.
-        if len(sys.argv) > 1 and yes:
-            call_openai_api(docs, folder_name)
-        else:
-            get_user_permission(docs, folder_name)
-
-
-    folder_counts = defaultdict(int)
-    folder_names = []
-    for dir_path in dir:
-        folder_name = os.path.basename(os.path.normpath(dir_path))
-        folder_counts[folder_name] += 1
-        if folder_counts[folder_name] > 1:
-            folder_name = f"{folder_name}_{folder_counts[folder_name]}"
-        folder_names.append(folder_name)
-
-    for directory, folder_name in zip(dir, folder_names):
-        process_one_docs(directory, folder_name)
-
-
-@app.command()
-def convert(dir: Optional[str] = typer.Option("inputs",
-                                              help="""Path to directory to make documentation for.
-                                                        E.g. --dir inputs """),
-            formats: Optional[str] = typer.Option("py",
-                                                  help="""Required language. 
-                                                        py, js, java supported for now""")):
-    """
-            Creates documentation linked to original functions from specified location.
-            By default /inputs folder is used, .py is parsed.
-    """
-    # Using a dictionary to map between the formats and their respective extraction functions
-    # makes the code more scalable. When adding more formats in the future, 
-    # you only need to update the extraction_functions dictionary.
-    extraction_functions = {
-    'py': extract_py,
-    'js': extract_js,
-    'java': extract_java
-    }
-
-    if formats in extraction_functions:
-        functions_dict, classes_dict = extraction_functions[formats](dir)
-    else:
-        raise Exception("Sorry, language not supported yet")                                   
-    transform_to_docs(functions_dict, classes_dict, formats, dir)
-
-
-if __name__ == "__main__":
-    app()
--- a/scripts/old/init.py
+++ b/scripts/old/init.py
--- a/scripts/old/ingest_rst.py
+++ b/scripts/old/ingest_rst.py
@@ -1,90 +0,0 @@
-import pickle
-import sys
-from argparse import ArgumentParser
-from pathlib import Path
-
-import dotenv
-import faiss
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens / 1000) * 0.0004)
-    return num_tokens, total_price
-
-
-def call_openai_api():
-    # Function to create a vector store from the documents and save it to disk.
-    store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
-    faiss.write_index(store.index, "docs.index")
-    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
-        pickle.dump(store, f)
-
-
-def get_user_permission():
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    docs_content = (" ".join(docs))
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api()
-    elif user_input == "":
-        call_openai_api()
-    else:
-        print("The API was not called. No money was spent.")
-
-
-# Load .env file
-dotenv.load_dotenv()
-
-ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")
-ap.add_argument("-i", "--inputs",
-                type=str,
-                default="inputs",
-                help="Directory containing documentation files")
-args = ap.parse_args()
-
-# Here we load in the data in the format that Notion exports it in.
-ps = list(Path(args.inputs).glob("**/*.rst"))
-
-# parse all child directories
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
-docs = []
-metadatas = []
-for i, d in enumerate(data):
-    splits = text_splitter.split_text(d)
-    docs.extend(splits)
-    metadatas.extend([{"source": sources[i]}] * len(splits))
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api()
-    else:
-        get_user_permission()
-else:
-    get_user_permission()
--- a/scripts/old/ingest_rst_sphinx.py
+++ b/scripts/old/ingest_rst_sphinx.py
@@ -1,133 +0,0 @@
-import os
-import pickle
-import shutil
-import sys
-from argparse import ArgumentParser
-from pathlib import Path
-
-import dotenv
-import faiss
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-from sphinx.cmd.build import main as sphinx_main
-
-
-def convert_rst_to_txt(src_dir, dst_dir):
-    # Check if the source directory exists
-    if not os.path.exists(src_dir):
-        raise Exception("Source directory does not exist")
-    # Walk through the source directory
-    for root, dirs, files in os.walk(src_dir):
-        for file in files:
-            # Check if the file has .rst extension
-            if file.endswith(".rst"):
-                # Construct the full path of the file
-                src_file = os.path.join(root, file.replace(".rst", ""))
-                # Convert the .rst file to .txt file using sphinx-build
-                args = f". -b text -D extensions=sphinx.ext.autodoc " \
-                       f"-D master_doc={src_file} " \
-                       f"-D source_suffix=.rst " \
-                       f"-C {dst_dir} "
-                sphinx_main(args.split())
-            elif file.endswith(".md"):
-                # Rename the .md file to .rst file
-                src_file = os.path.join(root, file)
-                dst_file = os.path.join(root, file.replace(".md", ".rst"))
-                os.rename(src_file, dst_file)
-                # Convert the .rst file to .txt file using sphinx-build
-                args = f". -b text -D extensions=sphinx.ext.autodoc " \
-                       f"-D master_doc={dst_file} " \
-                       f"-D source_suffix=.rst " \
-                       f"-C {dst_dir} "
-                sphinx_main(args.split())
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens / 1000) * 0.0004)
-    return num_tokens, total_price
-
-
-def call_openai_api():
-    # Function to create a vector store from the documents and save it to disk.
-    store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
-    faiss.write_index(store.index, "docs.index")
-    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
-        pickle.dump(store, f)
-
-
-def get_user_permission():
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    docs_content = (" ".join(docs))
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api()
-    elif user_input == "":
-        call_openai_api()
-    else:
-        print("The API was not called. No money was spent.")
-
-
-ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
-ap.add_argument("-i", "--inputs",
-                type=str,
-                default="inputs",
-                help="Directory containing documentation files")
-args = ap.parse_args()
-
-# Load .env file
-dotenv.load_dotenv()
-
-# Directory to vector
-src_dir = args.inputs
-dst_dir = "tmp"
-
-convert_rst_to_txt(src_dir, dst_dir)
-
-# Here we load in the data in the format that Notion exports it in.
-ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))
-
-# parse all child directories
-data = []
-sources = []
-for p in ps:
-    with open(p) as f:
-        data.append(f.read())
-    sources.append(p)
-
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
-docs = []
-metadatas = []
-for i, d in enumerate(data):
-    splits = text_splitter.split_text(d)
-    docs.extend(splits)
-    metadatas.extend([{"source": sources[i]}] * len(splits))
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api()
-    else:
-        get_user_permission()
-else:
-    get_user_permission()
-
-# Delete tmp folder
-# Commented out for now
-shutil.rmtree(dst_dir)
--- a/scripts/parser/init.py
+++ b/scripts/parser/init.py
@@ -1 +0,0 @@
-
--- a/scripts/parser/file/init.py
+++ b/scripts/parser/file/init.py
--- a/scripts/parser/file/base.py
+++ b/scripts/parser/file/base.py
@@ -1,19 +0,0 @@
-"""Base reader class."""
-from abc import abstractmethod
-from typing import Any, List
-
-from langchain.docstore.document import Document as LCDocument
-from parser.schema.base import Document
-
-
-class BaseReader:
-    """Utilities for loading data from a directory."""
-
-    @abstractmethod
-    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
-        """Load data from the input directory."""
-
-    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
-        """Load data in LangChain document format."""
-        docs = self.load_data(**load_kwargs)
-        return [d.to_langchain_format() for d in docs]
--- a/scripts/parser/file/base_parser.py
+++ b/scripts/parser/file/base_parser.py
@@ -1,38 +0,0 @@
-"""Base parser and config class."""
-
-from abc import abstractmethod
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-
-class BaseParser:
-    """Base class for all parsers."""
-
-    def __init__(self, parser_config: Optional[Dict] = None):
-        """Init params."""
-        self._parser_config = parser_config
-
-    def init_parser(self) -> None:
-        """Init parser and store it."""
-        parser_config = self._init_parser()
-        self._parser_config = parser_config
-
-    @property
-    def parser_config_set(self) -> bool:
-        """Check if parser config is set."""
-        return self._parser_config is not None
-
-    @property
-    def parser_config(self) -> Dict:
-        """Check if parser config is set."""
-        if self._parser_config is None:
-            raise ValueError("Parser config not set.")
-        return self._parser_config
-
-    @abstractmethod
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-
-    @abstractmethod
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file."""
--- a/scripts/parser/file/bulk.py
+++ b/scripts/parser/file/bulk.py
@@ -1,167 +0,0 @@
-"""Simple reader that reads files of different formats from a directory."""
-import logging
-from parser.file.base import BaseReader
-from parser.file.base_parser import BaseParser
-from parser.file.docs_parser import DocxParser, PDFParser
-from parser.file.epub_parser import EpubParser
-from parser.file.html_parser import HTMLParser
-from parser.file.markdown_parser import MarkdownParser
-from parser.file.rst_parser import RstParser
-from parser.file.tabular_parser import PandasCSVParser
-from parser.schema.base import Document
-from pathlib import Path
-from typing import Callable, Dict, List, Optional, Union
-
-DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
-    ".pdf": PDFParser(),
-    ".docx": DocxParser(),
-    ".csv": PandasCSVParser(),
-    ".epub": EpubParser(),
-    ".md": MarkdownParser(),
-    ".rst": RstParser(),
-    ".html": HTMLParser(),
-    ".mdx": MarkdownParser(),
-}
-
-
-class SimpleDirectoryReader(BaseReader):
-    """Simple directory reader.
-
-    Can read files into separate documents, or concatenates
-    files into one document text.
-
-    Args:
-        input_dir (str): Path to the directory.
-        input_files (List): List of file paths to read (Optional; overrides input_dir)
-        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
-        errors (str): how encoding and decoding errors are to be handled,
-              see https://docs.python.org/3/library/functions.html#open
-        recursive (bool): Whether to recursively search in subdirectories.
-            False by default.
-        required_exts (Optional[List[str]]): List of required extensions.
-            Default is None.
-        file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file
-            extension to a BaseParser class that specifies how to convert that file
-            to text. See DEFAULT_FILE_EXTRACTOR.
-        num_files_limit (Optional[int]): Maximum number of files to read.
-            Default is None.
-        file_metadata (Optional[Callable[str, Dict]]): A function that takes
-            in a filename and returns a Dict of metadata for the Document.
-            Default is None.
-    """
-
-    def __init__(
-            self,
-            input_dir: Optional[str] = None,
-            input_files: Optional[List] = None,
-            exclude_hidden: bool = True,
-            errors: str = "ignore",
-            recursive: bool = True,
-            required_exts: Optional[List[str]] = None,
-            file_extractor: Optional[Dict[str, BaseParser]] = None,
-            num_files_limit: Optional[int] = None,
-            file_metadata: Optional[Callable[[str], Dict]] = None,
-    ) -> None:
-        """Initialize with parameters."""
-        super().__init__()
-
-        if not input_dir and not input_files:
-            raise ValueError("Must provide either `input_dir` or `input_files`.")
-
-        self.errors = errors
-
-        self.recursive = recursive
-        self.exclude_hidden = exclude_hidden
-        self.required_exts = required_exts
-        self.num_files_limit = num_files_limit
-        print("input_files")
-        print(input_files)
-
-        if input_files:
-            self.input_files = []
-            for path in input_files:
-                input_file = Path(path)
-                self.input_files.append(input_file)
-        elif input_dir:
-            self.input_dir = Path(input_dir)
-            self.input_files = self._add_files(self.input_dir)
-
-        self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR
-        self.file_metadata = file_metadata
-
-    def _add_files(self, input_dir: Path) -> List[Path]:
-        """Add files."""
-        input_files = sorted(input_dir.iterdir())
-        new_input_files = []
-        dirs_to_explore = []
-        for input_file in input_files:
-            if input_file.is_dir():
-                if self.recursive:
-                    dirs_to_explore.append(input_file)
-            elif self.exclude_hidden and input_file.name.startswith("."):
-                continue
-            elif (
-                    self.required_exts is not None
-                    and input_file.suffix not in self.required_exts
-            ):
-                continue
-            else:
-                new_input_files.append(input_file)
-
-        for dir_to_explore in dirs_to_explore:
-            sub_input_files = self._add_files(dir_to_explore)
-            new_input_files.extend(sub_input_files)
-
-        if self.num_files_limit is not None and self.num_files_limit > 0:
-            new_input_files = new_input_files[0: self.num_files_limit]
-
-        # print total number of files added
-        logging.debug(
-            f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
-        )
-
-        return new_input_files
-
-    def load_data(self, concatenate: bool = False) -> List[Document]:
-        """Load data from the input directory.
-
-        Args:
-            concatenate (bool): whether to concatenate all files into one document.
-                If set to True, file metadata is ignored.
-                False by default.
-
-        Returns:
-            List[Document]: A list of documents.
-
-        """
-        data: Union[str, List[str]] = ""
-        data_list: List[str] = []
-        metadata_list = []
-        for input_file in self.input_files:
-            if input_file.suffix in self.file_extractor:
-                parser = self.file_extractor[input_file.suffix]
-                if not parser.parser_config_set:
-                    parser.init_parser()
-                data = parser.parse_file(input_file, errors=self.errors)
-            else:
-                # do standard read
-                with open(input_file, "r", errors=self.errors) as f:
-                    data = f.read()
-            if isinstance(data, List):
-                data_list.extend(data)
-                if self.file_metadata is not None:
-                    for _ in range(len(data)):
-                        metadata_list.append(self.file_metadata(str(input_file)))
-            else:
-                data_list.append(str(data))
-                if self.file_metadata is not None:
-                    metadata_list.append(self.file_metadata(str(input_file)))
-
-            
-
-        if concatenate:
-            return [Document("\n".join(data_list))]
-        elif self.file_metadata is not None:
-            return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
-        else:
-            return [Document(d) for d in data_list]
--- a/scripts/parser/file/docs_parser.py
+++ b/scripts/parser/file/docs_parser.py
@@ -1,59 +0,0 @@
-"""Docs parser.
-
-Contains parsers for docx, pdf files.
-
-"""
-from pathlib import Path
-from typing import Dict
-
-from parser.file.base_parser import BaseParser
-
-
-class PDFParser(BaseParser):
-    """PDF parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import PyPDF2
-        except ImportError:
-            raise ValueError("PyPDF2 is required to read PDF files.")
-        text_list = []
-        with open(file, "rb") as fp:
-            # Create a PDF object
-            pdf = PyPDF2.PdfReader(fp)
-
-            # Get the number of pages in the PDF document
-            num_pages = len(pdf.pages)
-
-            # Iterate over every page
-            for page in range(num_pages):
-                # Extract the text from the page
-                page_text = pdf.pages[page].extract_text()
-                text_list.append(page_text)
-        text = "\n".join(text_list)
-
-        return text
-
-
-class DocxParser(BaseParser):
-    """Docx parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import docx2txt
-        except ImportError:
-            raise ValueError("docx2txt is required to read Microsoft Word files.")
-
-        text = docx2txt.process(file)
-
-        return text
--- a/scripts/parser/file/epub_parser.py
+++ b/scripts/parser/file/epub_parser.py
@@ -1,43 +0,0 @@
-"""Epub parser.
-
-Contains parsers for epub files.
-"""
-
-from pathlib import Path
-from typing import Dict
-
-from parser.file.base_parser import BaseParser
-
-
-class EpubParser(BaseParser):
-    """Epub Parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
-        try:
-            import ebooklib
-            from ebooklib import epub
-        except ImportError:
-            raise ValueError("`EbookLib` is required to read Epub files.")
-        try:
-            import html2text
-        except ImportError:
-            raise ValueError("`html2text` is required to parse Epub files.")
-
-        text_list = []
-        book = epub.read_epub(file, options={"ignore_ncx": True})
-
-        # Iterate through all chapters.
-        for item in book.get_items():
-            # Chapters are typically located in epub documents items.
-            if item.get_type() == ebooklib.ITEM_DOCUMENT:
-                text_list.append(
-                    html2text.html2text(item.get_content().decode("utf-8"))
-                )
-
-        text = "\n".join(text_list)
-        return text
--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -1,83 +0,0 @@
-"""HTML parser.
-
-Contains parser for html files.
-
-"""
-import re
-from pathlib import Path
-from typing import Dict, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class HTMLParser(BaseParser):
-    """HTML parser."""
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
-        """Parse file.
-
-            Returns:
-            Union[str, List[str]]: a string or a List of strings.
-        """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #   {"text": "My Title", "type": "Title"},
-        #   {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-        # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings 
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be a user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string       
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
--- a/scripts/parser/file/markdown_parser.py
+++ b/scripts/parser/file/markdown_parser.py
@@ -1,149 +0,0 @@
-"""Markdown parser.
-
-Contains parser for md files.
-
-"""
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
-
-import tiktoken
-from parser.file.base_parser import BaseParser
-
-
-class MarkdownParser(BaseParser):
-    """Markdown parser.
-
-    Extract text from markdown files.
-    Returns dictionary with keys as headers and values as the text between headers.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            remove_hyperlinks: bool = True,
-            remove_images: bool = True,
-            max_tokens: int = 2048,
-            # remove_tables: bool = True,
-            **kwargs: Any,
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._remove_hyperlinks = remove_hyperlinks
-        self._remove_images = remove_images
-        self._max_tokens = max_tokens
-        # self._remove_tables = remove_tables
-
-    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
-                          current_text: str):
-        """Append to tups chunk."""
-        num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
-        if num_tokens > self._max_tokens:
-            chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
-            for chunk in chunks:
-                tups.append((current_header, chunk))
-        else:
-            tups.append((current_header, current_text))
-        return tups
-
-    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
-        """Convert a markdown file to a dictionary.
-
-        The keys are the headers and the values are the text under each header.
-
-        """
-        markdown_tups: List[Tuple[Optional[str], str]] = []
-        lines = markdown_text.split("\n")
-
-        current_header = None
-        current_text = ""
-
-        for line in lines:
-            header_match = re.match(r"^#+\s", line)
-            if header_match:
-                if current_header is not None:
-                    if current_text == "" or None:
-                        continue
-                    markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
-
-                current_header = line
-                current_text = ""
-            else:
-                current_text += line + "\n"
-        markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
-
-        if current_header is not None:
-            # pass linting, assert keys are defined
-            markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
-                for key, value in markdown_tups
-            ]
-        else:
-            markdown_tups = [
-                (key, re.sub("\n", "", value)) for key, value in markdown_tups
-            ]
-
-        return markdown_tups
-
-    def remove_images(self, content: str) -> str:
-        """Get a dictionary of a markdown file from its path."""
-        pattern = r"!{1}\[\[(.*)\]\]"
-        content = re.sub(pattern, "", content)
-        return content
-
-    # def remove_tables(self, content: str) -> List[List[str]]:
-    #     """Convert markdown tables to nested lists."""
-    #     table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)"
-    #     table_cells_pattern = r"([^\|\r\n]*)\|"
-    #
-    #     table_rows = re.findall(table_rows_pattern, content, re.MULTILINE)
-    #     table_lists = []
-    #     for row in table_rows:
-    #         cells = re.findall(table_cells_pattern, row[2])
-    #         cells = [cell.strip() for cell in cells if cell.strip()]
-    #         table_lists.append(cells)
-    #     return str(table_lists)
-
-    def remove_hyperlinks(self, content: str) -> str:
-        """Get a dictionary of a markdown file from its path."""
-        pattern = r"\[(.*?)\]\((.*?)\)"
-        content = re.sub(pattern, r"\1", content)
-        return content
-
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-        return {}
-
-    def parse_tups(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> List[Tuple[Optional[str], str]]:
-        """Parse file into tuples."""
-        with open(filepath, "r", encoding='utf8') as f:
-            try:
-                content = f.read()
-            except (Exception,) as e:
-                print(f'Error a file: "{filepath}"')
-                raise e
-        if self._remove_hyperlinks:
-            content = self.remove_hyperlinks(content)
-        if self._remove_images:
-            content = self.remove_images(content)
-        # if self._remove_tables:
-        #     content = self.remove_tables(content)
-        markdown_tups = self.markdown_to_tups(content)
-        return markdown_tups
-
-    def parse_file(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> Union[str, List[str]]:
-        """Parse file into string."""
-        tups = self.parse_tups(filepath, errors=errors)
-        results = []
-        # TODO: don't include headers right now
-        for header, value in tups:
-            if header is None:
-                results.append(value)
-            else:
-                results.append(f"\n\n{header}\n{value}")
-        return results
--- a/scripts/parser/file/openapi3_parser.py
+++ b/scripts/parser/file/openapi3_parser.py
@@ -1,51 +0,0 @@
-from urllib.parse import urlparse
-
-from openapi_parser import parse
-
-try:
-    from scripts.parser.file.base_parser import BaseParser
-except ModuleNotFoundError:
-    from base_parser import BaseParser
-
-
-class OpenAPI3Parser(BaseParser):
-    def init_parser(self) -> None:
-        return super().init_parser()
-
-    def get_base_urls(self, urls):
-        base_urls = []
-        for i in urls:
-            parsed_url = urlparse(i)
-            base_url = parsed_url.scheme + "://" + parsed_url.netloc
-            if base_url not in base_urls:
-                base_urls.append(base_url)
-        return base_urls
-
-    def get_info_from_paths(self, path):
-        info = ""
-        if path.operations:
-            for operation in path.operations:
-                info += (
-                    f"\n{operation.method.value}="
-                    f"{operation.responses[0].description}"
-                )
-        return info
-
-    def parse_file(self, file_path):
-        data = parse(file_path)
-        results = ""
-        base_urls = self.get_base_urls(link.url for link in data.servers)
-        base_urls = ",".join([base_url for base_url in base_urls])
-        results += f"Base URL:{base_urls}\n"
-        i = 1
-        for path in data.paths:
-            info = self.get_info_from_paths(path)
-            results += (
-                f"Path{i}: {path.url}\n"
-                f"description: {path.description}\n"
-                f"parameters: {path.parameters}\nmethods: {info}\n"
-            )
-            i += 1
-        with open("results.txt", "w") as f:
-            f.write(results)
-        return results
--- a/scripts/parser/file/rst_parser.py
+++ b/scripts/parser/file/rst_parser.py
@@ -1,173 +0,0 @@
-"""reStructuredText parser.
-
-Contains parser for md files.
-
-"""
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class RstParser(BaseParser):
-    """reStructuredText parser.
-
-    Extract text from .rst files.
-    Returns dictionary with keys as headers and values as the text between headers.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            remove_hyperlinks: bool = True,
-            remove_images: bool = True,
-            remove_table_excess: bool = True,
-            remove_interpreters: bool = True,
-            remove_directives: bool = True,
-            remove_whitespaces_excess: bool = True,
-            # Be careful with remove_characters_excess, might cause data loss
-            remove_characters_excess: bool = True,
-            **kwargs: Any,
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._remove_hyperlinks = remove_hyperlinks
-        self._remove_images = remove_images
-        self._remove_table_excess = remove_table_excess
-        self._remove_interpreters = remove_interpreters
-        self._remove_directives = remove_directives
-        self._remove_whitespaces_excess = remove_whitespaces_excess
-        self._remove_characters_excess = remove_characters_excess
-
-    def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
-        """Convert a reStructuredText file to a dictionary.
-
-        The keys are the headers and the values are the text under each header.
-
-        """
-        rst_tups: List[Tuple[Optional[str], str]] = []
-        lines = rst_text.split("\n")
-
-        current_header = None
-        current_text = ""
-
-        for i, line in enumerate(lines):
-            header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
-            if header_match and i > 0 and (
-                    len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
-                if current_header is not None:
-                    if current_text == "" or None:
-                        continue
-                    # removes the next heading from current Document
-                    if current_text.endswith(lines[i - 1] + "\n"):
-                        current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")]
-                    rst_tups.append((current_header, current_text))
-
-                current_header = lines[i - 1]
-                current_text = ""
-            else:
-                current_text += line + "\n"
-
-        rst_tups.append((current_header, current_text))
-
-        # TODO: Format for rst
-        #
-        # if current_header is not None:
-        #     # pass linting, assert keys are defined
-        #     rst_tups = [
-        #         (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
-        #         for key, value in rst_tups
-        #     ]
-        # else:
-        #     rst_tups = [
-        #         (key, re.sub("\n", "", value)) for key, value in rst_tups
-        #     ]
-
-        if current_header is None:
-            rst_tups = [
-                (key, re.sub("\n", "", value)) for key, value in rst_tups
-            ]
-        return rst_tups
-
-    def remove_images(self, content: str) -> str:
-        pattern = r"\.\. image:: (.*)"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_hyperlinks(self, content: str) -> str:
-        pattern = r"`(.*?) <(.*?)>`_"
-        content = re.sub(pattern, r"\1", content)
-        return content
-
-    def remove_directives(self, content: str) -> str:
-        """Removes reStructuredText Directives"""
-        pattern = r"`\.\.([^:]+)::"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_interpreters(self, content: str) -> str:
-        """Removes reStructuredText Interpreted Text Roles"""
-        pattern = r":(\w+):"
-        content = re.sub(pattern, "", content)
-        return content
-
-    def remove_table_excess(self, content: str) -> str:
-        """Pattern to remove grid table separators"""
-        pattern = r"^\+[-]+\+[-]+\+$"
-        content = re.sub(pattern, "", content, flags=re.MULTILINE)
-        return content
-
-    def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
-        """Pattern to match 2 or more consecutive whitespaces"""
-        pattern = r"\s{2,}"
-        content = [(key, re.sub(pattern, "  ", value)) for key, value in content]
-        return content
-
-    def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
-        """Pattern to match 2 or more consecutive characters"""
-        pattern = r"(\S)\1{2,}"
-        content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content]
-        return content
-
-    def _init_parser(self) -> Dict:
-        """Initialize the parser with the config."""
-        return {}
-
-    def parse_tups(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> List[Tuple[Optional[str], str]]:
-        """Parse file into tuples."""
-        with open(filepath, "r") as f:
-            content = f.read()
-        if self._remove_hyperlinks:
-            content = self.remove_hyperlinks(content)
-        if self._remove_images:
-            content = self.remove_images(content)
-        if self._remove_table_excess:
-            content = self.remove_table_excess(content)
-        if self._remove_directives:
-            content = self.remove_directives(content)
-        if self._remove_interpreters:
-            content = self.remove_interpreters(content)
-        rst_tups = self.rst_to_tups(content)
-        if self._remove_whitespaces_excess:
-            rst_tups = self.remove_whitespaces_excess(rst_tups)
-        if self._remove_characters_excess:
-            rst_tups = self.remove_characters_excess(rst_tups)
-        return rst_tups
-
-    def parse_file(
-            self, filepath: Path, errors: str = "ignore"
-    ) -> Union[str, List[str]]:
-        """Parse file into string."""
-        tups = self.parse_tups(filepath, errors=errors)
-        results = []
-        # TODO: don't include headers right now
-        for header, value in tups:
-            if header is None:
-                results.append(value)
-            else:
-                results.append(f"\n\n{header}\n{value}")
-        return results
--- a/scripts/parser/file/tabular_parser.py
+++ b/scripts/parser/file/tabular_parser.py
@@ -1,115 +0,0 @@
-"""Tabular parser.
-
-Contains parsers for tabular data files.
-
-"""
-from pathlib import Path
-from typing import Any, Dict, List, Union
-
-from parser.file.base_parser import BaseParser
-
-
-class CSVParser(BaseParser):
-    """CSV parser.
-
-    Args:
-        concat_rows (bool): whether to concatenate all rows into one document.
-            If set to False, a Document will be created for each row.
-            True by default.
-
-    """
-
-    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._concat_rows = concat_rows
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file.
-
-        Returns:
-            Union[str, List[str]]: a string or a List of strings.
-
-        """
-        try:
-            import csv
-        except ImportError:
-            raise ValueError("csv module is required to read CSV files.")
-        text_list = []
-        with open(file, "r") as fp:
-            csv_reader = csv.reader(fp)
-            for row in csv_reader:
-                text_list.append(", ".join(row))
-        if self._concat_rows:
-            return "\n".join(text_list)
-        else:
-            return text_list
-
-
-class PandasCSVParser(BaseParser):
-    r"""Pandas-based CSV parser.
-
-    Parses CSVs using the separator detection from Pandas `read_csv`function.
-    If special parameters are required, use the `pandas_config` dict.
-
-    Args:
-        concat_rows (bool): whether to concatenate all rows into one document.
-            If set to False, a Document will be created for each row.
-            True by default.
-
-        col_joiner (str): Separator to use for joining cols per row.
-            Set to ", " by default.
-
-        row_joiner (str): Separator to use for joining each row.
-            Only used when `concat_rows=True`.
-            Set to "\n" by default.
-
-        pandas_config (dict): Options for the `pandas.read_csv` function call.
-            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
-            for more information.
-            Set to empty dict by default, this means pandas will try to figure
-            out the separators, table head, etc. on its own.
-
-    """
-
-    def __init__(
-            self,
-            *args: Any,
-            concat_rows: bool = True,
-            col_joiner: str = ", ",
-            row_joiner: str = "\n",
-            pandas_config: dict = {},
-            **kwargs: Any
-    ) -> None:
-        """Init params."""
-        super().__init__(*args, **kwargs)
-        self._concat_rows = concat_rows
-        self._col_joiner = col_joiner
-        self._row_joiner = row_joiner
-        self._pandas_config = pandas_config
-
-    def _init_parser(self) -> Dict:
-        """Init parser."""
-        return {}
-
-    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
-        """Parse file."""
-        try:
-            import pandas as pd
-        except ImportError:
-            raise ValueError("pandas module is required to read CSV files.")
-
-        df = pd.read_csv(file, **self._pandas_config)
-
-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
-
-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
--- a/scripts/parser/java2doc.py
+++ b/scripts/parser/java2doc.py
@@ -1,66 +0,0 @@
-import os
-
-import javalang
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.java'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, "r") as file:
-        java_code = file.read()
-        methods = {}
-        tree = javalang.parse.parse(java_code)
-        for _, node in tree.filter(javalang.tree.MethodDeclaration):
-            method_name = node.name
-            start_line = node.position.line - 1
-            end_line = start_line
-            brace_count = 0
-            for line in java_code.splitlines()[start_line:]:
-                end_line += 1
-                brace_count += line.count("{") - line.count("}")
-                if brace_count == 0:
-                    break
-            method_source_code = "\n".join(java_code.splitlines()[start_line:end_line])
-            methods[method_name] = method_source_code
-    return methods
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = javalang.parse.parse(source_code)
-        for class_decl in tree.types:
-            class_name = class_decl.name
-            declarations = []
-            methods = []
-            for field_decl in class_decl.fields:
-                field_name = field_decl.declarators[0].name
-                field_type = field_decl.type.name
-                declarations.append(f"{field_type} {field_name}")
-            for method_decl in class_decl.methods:
-                methods.append(method_decl.name)
-            class_string = "Declarations: " + ", ".join(declarations) + "\n  Method name: " + ", ".join(methods)
-            classes[class_name] = class_string
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
--- a/scripts/parser/js2doc.py
+++ b/scripts/parser/js2doc.py
@@ -1,70 +0,0 @@
-import os
-
-import escodegen
-import esprima
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.js'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        functions = {}
-        tree = esprima.parseScript(source_code)
-        for node in tree.body:
-            if node.type == 'FunctionDeclaration':
-                func_name = node.id.name if node.id else '<anonymous>'
-                functions[func_name] = escodegen.generate(node)
-            elif node.type == 'VariableDeclaration':
-                for declaration in node.declarations:
-                    if declaration.init and declaration.init.type == 'FunctionExpression':
-                        func_name = declaration.id.name if declaration.id else '<anonymous>'
-                        functions[func_name] = escodegen.generate(declaration.init)
-            elif node.type == 'ClassDeclaration':
-                for subnode in node.body.body:
-                    if subnode.type == 'MethodDefinition':
-                        func_name = subnode.key.name
-                        functions[func_name] = escodegen.generate(subnode.value)
-                    elif subnode.type == 'VariableDeclaration':
-                        for declaration in subnode.declarations:
-                            if declaration.init and declaration.init.type == 'FunctionExpression':
-                                func_name = declaration.id.name if declaration.id else '<anonymous>'
-                                functions[func_name] = escodegen.generate(declaration.init)
-        return functions
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = esprima.parseScript(source_code)
-        for node in tree.body:
-            if node.type == 'ClassDeclaration':
-                class_name = node.id.name
-                function_names = []
-                for subnode in node.body.body:
-                    if subnode.type == 'MethodDefinition':
-                        function_names.append(subnode.key.name)
-                classes[class_name] = ", ".join(function_names)
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -1,100 +0,0 @@
-import os
-
-import tiktoken
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import FAISS
-from retry import retry
-
-
-# from langchain.embeddings import HuggingFaceEmbeddings
-# from langchain.embeddings import HuggingFaceInstructEmbeddings
-# from langchain.embeddings import CohereEmbeddings
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> tuple[int, float]:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
-
-
-@retry(tries=10, delay=60)
-def store_add_texts_with_retry(store, i):
-    store.add_texts([i.page_content], metadatas=[i.metadata])
-    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])
-
-
-def call_openai_api(docs, folder_name):
-    # Function to create a vector store from the documents and save it to disk.
-
-    # create output folder if it doesn't exist
-    if not os.path.exists(f"outputs/{folder_name}"):
-        os.makedirs(f"outputs/{folder_name}")
-
-    from tqdm import tqdm
-
-    docs_test = [docs[0]]
-    # remove the first element from docs
-    docs.pop(0)
-    # cut first n docs if you want to restart
-    # docs = docs[:n]
-    c1 = 0
-    # pinecone.init(
-    #     api_key="",  # find at app.pinecone.io
-    #     environment="us-east1-gcp"  # next to api key in console
-    # )
-    # index_name = "pandas"
-    if (  # azure
-        os.environ.get("OPENAI_API_BASE")
-        and os.environ.get("OPENAI_API_VERSION")
-        and os.environ.get("AZURE_DEPLOYMENT_NAME")
-        and os.environ.get("AZURE_EMBEDDINGS_DEPLOYMENT_NAME")
-    ):
-        os.environ["OPENAI_API_TYPE"] = "azure"
-        openai_embeddings = OpenAIEmbeddings(model=os.environ.get("AZURE_EMBEDDINGS_DEPLOYMENT_NAME"))
-    else:
-        openai_embeddings = OpenAIEmbeddings()
-    store = FAISS.from_documents(docs_test, openai_embeddings)
-    # store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name)
-
-    # Uncomment for MPNet embeddings
-    # model_name = "sentence-transformers/all-mpnet-base-v2"
-    # hf = HuggingFaceEmbeddings(model_name=model_name)
-    # store = FAISS.from_documents(docs_test, hf)
-    for i in tqdm(
-        docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format="{l_bar}{bar}| Time Left: {remaining}"
-    ):
-        try:
-            store_add_texts_with_retry(store, i)
-        except Exception as e:
-            print(e)
-            print("Error on ", i)
-            print("Saving progress")
-            print(f"stopped at {c1} out of {len(docs)}")
-            store.save_local(f"outputs/{folder_name}")
-            break
-        c1 += 1
-    store.save_local(f"outputs/{folder_name}")
-
-
-def get_user_permission(docs, folder_name):
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api(docs, folder_name)
-    elif user_input == "":
-        call_openai_api(docs, folder_name)
-    else:
-        print("The API was not called. No money was spent.")
--- a/scripts/parser/py2doc.py
+++ b/scripts/parser/py2doc.py
@@ -1,121 +0,0 @@
-import ast
-import os
-from pathlib import Path
-
-import tiktoken
-from langchain_community.llms import OpenAI
-from langchain.prompts import PromptTemplate
-
-
-def find_files(directory):
-    files_list = []
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.py'):
-                files_list.append(os.path.join(root, file))
-    return files_list
-
-
-def extract_functions(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        functions = {}
-        tree = ast.parse(source_code)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.FunctionDef):
-                func_name = node.name
-                func_def = ast.get_source_segment(source_code, node)
-                functions[func_name] = func_def
-    return functions
-
-
-def extract_classes(file_path):
-    with open(file_path, 'r') as file:
-        source_code = file.read()
-        classes = {}
-        tree = ast.parse(source_code)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.ClassDef):
-                class_name = node.name
-                function_names = []
-                for subnode in ast.walk(node):
-                    if isinstance(subnode, ast.FunctionDef):
-                        function_names.append(subnode.name)
-                classes[class_name] = ", ".join(function_names)
-    return classes
-
-
-def extract_functions_and_classes(directory):
-    files = find_files(directory)
-    functions_dict = {}
-    classes_dict = {}
-    for file in files:
-        functions = extract_functions(file)
-        if functions:
-            functions_dict[file] = functions
-        classes = extract_classes(file)
-        if classes:
-            classes_dict[file] = classes
-    return functions_dict, classes_dict
-
-
-def parse_functions(functions_dict, formats, dir):
-    c1 = len(functions_dict)
-    for i, (source, functions) in enumerate(functions_dict.items(), start=1):
-        print(f"Processing file {i}/{c1}")
-        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
-        subfolders = "/".join(source_w.split("/")[:-1])
-        Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
-        for j, (name, function) in enumerate(functions.items(), start=1):
-            print(f"Processing function {j}/{len(functions)}")
-            prompt = PromptTemplate(
-                input_variables=["code"],
-                template="Code: \n{code}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(code=function))
-            mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
-            with open(f"outputs/{source_w}", mode) as f:
-                f.write(
-                    f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
-
-
-def parse_classes(classes_dict, formats, dir):
-    c1 = len(classes_dict)
-    for i, (source, classes) in enumerate(classes_dict.items()):
-        print(f"Processing file {i + 1}/{c1}")
-        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
-        subfolders = "/".join(source_w.split("/")[:-1])
-        Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
-        for name, function_names in classes.items():
-            print(f"Processing Class {i + 1}/{c1}")
-            prompt = PromptTemplate(
-                input_variables=["class_name", "functions_names"],
-                template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
-            )
-            llm = OpenAI(temperature=0)
-            response = llm(prompt.format(class_name=name, functions_names=function_names))
-
-            with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
-                f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
-
-
-def transform_to_docs(functions_dict, classes_dict, formats, dir):
-    docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
-    docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
-
-    num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content))
-    total_price = ((num_tokens / 1000) * 0.02)
-
-    print(f"Number of Tokens = {num_tokens:,d}")
-    print(f"Approx Cost = ${total_price:,.2f}")
-
-    user_input = input("Price Okay? (Y/N)\n").lower()
-    if user_input == "y" or user_input == "":
-        if not Path("outputs").exists():
-            Path("outputs").mkdir()
-        parse_functions(functions_dict, formats, dir)
-        parse_classes(classes_dict, formats, dir)
-        print("All done!")
-    else:
-        print("The API was not called. No money was spent.")
--- a/scripts/parser/schema/init.py
+++ b/scripts/parser/schema/init.py
--- a/scripts/parser/schema/base.py
+++ b/scripts/parser/schema/base.py
@@ -1,34 +0,0 @@
-"""Base schema for readers."""
-from dataclasses import dataclass
-
-from langchain.docstore.document import Document as LCDocument
-from parser.schema.schema import BaseDocument
-
-
-@dataclass
-class Document(BaseDocument):
-    """Generic interface for a data document.
-
-    This document connects to data sources.
-
-    """
-
-    def __post_init__(self) -> None:
-        """Post init."""
-        if self.text is None:
-            raise ValueError("text field not set.")
-
-    @classmethod
-    def get_type(cls) -> str:
-        """Get Document type."""
-        return "Document"
-
-    def to_langchain_format(self) -> LCDocument:
-        """Convert struct to LangChain document format."""
-        metadata = self.extra_info or {}
-        return LCDocument(page_content=self.text, metadata=metadata)
-
-    @classmethod
-    def from_langchain_format(cls, doc: LCDocument) -> "Document":
-        """Convert struct from LangChain document format."""
-        return cls(text=doc.page_content, extra_info=doc.metadata)
--- a/scripts/parser/schema/schema.py
+++ b/scripts/parser/schema/schema.py
@@ -1,64 +0,0 @@
-"""Base schema for data structures."""
-from abc import abstractmethod
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-
-from dataclasses_json import DataClassJsonMixin
-
-
-@dataclass
-class BaseDocument(DataClassJsonMixin):
-    """Base document.
-
-    Generic abstract interfaces that captures both index structs
-    as well as documents.
-
-    """
-
-    # TODO: consolidate fields from Document/IndexStruct into base class
-    text: Optional[str] = None
-    doc_id: Optional[str] = None
-    embedding: Optional[List[float]] = None
-
-    # extra fields
-    extra_info: Optional[Dict[str, Any]] = None
-
-    @classmethod
-    @abstractmethod
-    def get_type(cls) -> str:
-        """Get Document type."""
-
-    def get_text(self) -> str:
-        """Get text."""
-        if self.text is None:
-            raise ValueError("text field not set.")
-        return self.text
-
-    def get_doc_id(self) -> str:
-        """Get doc_id."""
-        if self.doc_id is None:
-            raise ValueError("doc_id not set.")
-        return self.doc_id
-
-    @property
-    def is_doc_id_none(self) -> bool:
-        """Check if doc_id is None."""
-        return self.doc_id is None
-
-    def get_embedding(self) -> List[float]:
-        """Get embedding.
-
-        Errors if embedding is None.
-
-        """
-        if self.embedding is None:
-            raise ValueError("embedding not set.")
-        return self.embedding
-
-    @property
-    def extra_info_str(self) -> Optional[str]:
-        """Extra info string."""
-        if self.extra_info is None:
-            return None
-
-        return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()])
--- a/scripts/parser/token_func.py
+++ b/scripts/parser/token_func.py
@@ -1,76 +0,0 @@
-import re
-from math import ceil
-from typing import List
-
-import tiktoken
-from parser.schema.base import Document
-
-def separate_header_and_body(text):
-    header_pattern = r"^(.*?\n){3}"
-    match = re.match(header_pattern, text)
-    header = match.group(0)
-    body = text[len(header):]
-    return header, body
-
-
-def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
-    docs = []
-    current_group = None
-
-    for doc in documents:
-        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
-
-        if current_group is None:
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(
-                current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
-            current_group.text += " " + doc.text
-        else:
-            docs.append(current_group)
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-
-    if current_group is not None:
-        docs.append(current_group)
-
-    return docs
-
-
-def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
-    docs = []
-    for doc in documents:
-        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
-        if token_length <= max_tokens:
-            docs.append(doc)
-        else:
-            header, body = separate_header_and_body(doc.text)
-            if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens:
-                body = doc.text
-                header = ""
-            num_body_parts = ceil(token_length / max_tokens)
-            part_length = ceil(len(body) / num_body_parts)
-            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
-            for i, body_part in enumerate(body_parts):
-                new_doc = Document(text=header + body_part.strip(),
-                                   doc_id=f"{doc.doc_id}-{i}",
-                                   embedding=doc.embedding,
-                                   extra_info=doc.extra_info)
-                docs.append(new_doc)
-    return docs
-
-
-def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    if not token_check:
-        return documents
-    print("Grouping small documents")
-    try:
-        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except Exception:
-        print("Grouping failed, try running without token_check")
-    print("Separating large documents")
-    try:
-        documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except Exception:
-        print("Grouping failed, try running without token_check")
-    return documents
--- a/setup.sh
+++ b/setup.sh
@@ -66,7 +66,7 @@ download_locally() {
    # Call the function to check and start Docker if needed
    check_and_start_docker

-    docker-compose -f docker-compose-local.yaml build && docker-compose -f docker-compose-local.yaml up -d
+    docker compose -f deployment/docker-compose-local.yaml build && docker compose -f deployment/docker-compose-local.yaml up -d
    #python -m venv venv
    #source venv/bin/activate
    pip install -r application/requirements.txt
@@ -82,7 +82,7 @@ download_locally() {
    echo "You can stop the application by running the following command:"
    echo "Ctrl + C and then"
    echo "Then pkill -f 'flask run' and then"
-    echo "docker-compose down"
+    echo "docker compose -f deployment/docker-compose-local.yaml down"
    flask run --host=0.0.0.0 --port=7091 &
    celery -A application.app.celery worker -l INFO
 }
@@ -98,11 +98,11 @@ use_openai() {
    # Call the function to check and start Docker if needed
    check_and_start_docker
    
-    docker-compose build && docker-compose up -d
+    docker compose -f deployment/docker-compose.yaml build && docker compose -f deployment/docker-compose.yaml up -d

    echo "The application will run on http://localhost:5173"
    echo "You can stop the application by running the following command:"
-    echo "docker-compose down"
+    echo "docker compose -f deployment/docker-compose.yaml down"
 }

 use_docsgpt() {
@@ -113,11 +113,11 @@ use_docsgpt() {
    # Call the function to check and start Docker if needed
    check_and_start_docker

-    docker-compose build && docker-compose up -d
+    docker compose -f deployment/docker-compose.yaml build && docker compose -f deployment/docker-compose.yaml up -d

    echo "The application will run on http://localhost:5173"
    echo "You can stop the application by running the following command:"
-    echo "docker-compose down"
+    echo "docker compose -f deployment/docker-compose.yaml down"
 }

 # Prompt the user for their choice