Compare commits

...

413 Commits
0.1.0 ... 0.5.0

Author SHA1 Message Date
Alex
95fdedf12e Merge pull request #361 from Ayush-Prabhu/patch-1
Grammatical corrections
2023-10-01 20:58:09 +01:00
Alex
c73dd776db Merge pull request #362 from arc53/feature/startup-script-cpu-inference
script + cpu optimisations
2023-10-01 20:11:42 +01:00
Alex
891e5fea3f Update README.md 2023-10-01 20:10:41 +01:00
Alex
bb2f6f23b5 Update README.md 2023-10-01 20:09:15 +01:00
Alex
cd9b03bdb9 celery syncs 2023-10-01 20:05:13 +01:00
Alex
a619269502 celery bugs 2023-10-01 19:55:11 +01:00
Alex
9a33bf2210 script + cpu optimisations 2023-10-01 19:16:13 +01:00
Ayush-Prabhu
34b4cd2231 Grammatical corrections
Corrected grammatical errors and spelling errors in : docs/pages/Guides/My-AI-answers-questions-using-external-knowledge.md
2023-10-01 23:36:50 +05:30
Alex
6045cbbc62 Merge pull request #355 from arc53/feature/cpu-llm
llama-cpp local
2023-10-01 17:55:26 +01:00
Alex
9bbf4044e0 script 2023-10-01 17:20:47 +01:00
Alex
fcf8a64d91 Merge pull request #360 from jbampton/fix-spelling
Fix spelling
2023-10-01 17:09:53 +01:00
John Bampton
2c6ab18e41 Fix spelling 2023-10-02 01:25:23 +10:00
Alex
2fea294b13 Update settings.py 2023-10-01 11:28:06 +01:00
Pavel
b47ecab1a9 llama-cpp local 2023-09-30 23:38:48 +04:00
Pavel
b86c294250 Merge pull request #354 from arc53/featue/elasticsearch
working es
2023-09-30 17:37:37 +03:00
Alex
3eacfb91aa fix314 2023-09-30 15:32:37 +01:00
Alex
94164c2a71 Merge branch 'main' into featue/elasticsearch 2023-09-30 15:30:23 +01:00
Alex
d85eb83ea2 elastic search fixes 2023-09-30 15:25:31 +01:00
Alex
b2002639db Merge pull request #353 from arc53/bug/fix-aes-pdf
Update requirements.txt
2023-09-29 17:34:17 +01:00
Alex
347cfe253f elastic2 2023-09-29 17:17:48 +01:00
Pavel
833e1836e1 Merge pull request #352 from arc53/feature/aws-sagemaker-inference
sagemaker + llm creator class
2023-09-29 17:42:54 +03:00
Alex
e4be38b9f7 sagemaker + llm creator class 2023-09-29 01:09:01 +01:00
Alex
783e7f6939 working es 2023-09-29 00:32:19 +01:00
Alex
c1c54f4848 Update README.md 2023-09-28 16:07:50 +01:00
Alex
86be6be2d2 Update Dockerfile 2023-09-28 15:30:47 +01:00
Alex
35a63e867a Merge pull request #345 from beardcodes/patch-1
Update Dockerfile
2023-09-28 15:30:19 +01:00
Alex
9c12a417ee Update README.md 2023-09-28 15:22:56 +01:00
Alex
32a019c0d6 Update requirements.txt 2023-09-27 22:39:48 +01:00
Pavel
b7e4a3c99e Merge pull request #348 from arc53/feature/better-structure
Feature/better structure
2023-09-27 20:18:09 +03:00
Alex
039062d071 ruff fix 2023-09-27 18:10:26 +01:00
Alex
83ae3e8371 more ruff fixes 2023-09-27 18:04:07 +01:00
Alex
852de8bdfc ruff linting 2023-09-27 18:01:40 +01:00
Alex
b8acb860aa some tests 2023-09-27 17:54:57 +01:00
Alex
e6849b85d1 Create huggingface.py 2023-09-27 17:02:47 +01:00
Alex
8fa9657ba6 working full 2023-09-27 16:25:57 +01:00
Zakarya El Quaroui
04b038960b Update Dockerfile
The current node version is vulnerable to buffer overflow. 

CVE-2022-3602: X.509 Email Address 4-byte Buffer Overflow
Assigner: OpenSSL. Published: 2022-11-01, updated: 2022-11-03.
A buffer overrun can be triggered in X.509 certificate verification, specifically in name constraint checking. Note that this occurs after certificate chain signature verification and requires either a CA to have signed the malicious certificate or for the application to continue certificate verification despite failure to construct a path to a trusted issuer. An attacker can craft a malicious email address to overflow four attacker-controlled bytes on the stack. This buffer overflow could result in a crash (causing a denial of service) or potentially remote code execution. Many platforms implement stack overflow protections which would mitigate against the risk of remote code execution. The risk may be further mitigated based on stack layout for any given platform/compiler. Pre-announcements of CVE-2022-3602 described this issue as CRITICAL. Further analysis based on some of the mitigating factors described above have led this to be downgraded to HIGH. Users are still encouraged to upgrade to a new version as soon as possible. In a TLS client, this can be triggered by connecting to a malicious server. In a TLS server, this can be triggered if the server requests client authentication and a malicious client connects. Fixed in OpenSSL 3.0.7 (Affected 3.0.0,3.0.1,3.0.2,3.0.3,3.0.4,3.0.5,3.0.6).
2023-09-27 17:08:44 +08:00
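For context on the CVE above, here is a quick, illustrative way to check which OpenSSL build a Python runtime links against and whether it falls in the affected 3.0.0–3.0.6 range. This is a sketch for verification only, not part of the commit itself:

```python
import ssl

# CVE-2022-3602 affects OpenSSL 3.0.0 through 3.0.6; fixed in 3.0.7.
print(ssl.OPENSSL_VERSION)  # e.g. "OpenSSL 3.0.2 15 Mar 2022"

version = ssl.OPENSSL_VERSION.split()[1]  # e.g. "3.0.2"
if version.startswith("3.0."):
    patch = int("".join(c for c in version.split(".")[2] if c.isdigit()))
    print("affected by CVE-2022-3602:", patch < 7)
```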
Alex
52507a5a95 Merge pull request #342 from arc53/hacktoberfest 2023-09-26 18:36:14 +01:00
Alex
d8505ba2ab Update README.md 2023-09-26 15:14:26 +01:00
Alex
fa26c0997e Update index.mdx 2023-09-26 15:07:42 +01:00
Alex
5a0aadd2ae Hacktoberfest info 2023-09-26 13:48:57 +01:00
Alex
025549ebf8 fixes to make it work 2023-09-26 13:00:17 +01:00
Alex
e85a583f0a testings 2023-09-26 10:03:22 +01:00
Alex
f7244ddb7a Merge pull request #340 from DenyTwice/main
UI Improvements, implements task 3 in issue #279
2023-09-24 11:13:27 +01:00
DenyTwice
d983a519e3 Uncomments selectDocsModal, removes redundant styles 2023-09-23 21:43:16 +05:30
DenyTwice
ae01070b8f Design consistency changes, fixes arrow icon positioning in source docs dropdown 2023-09-23 21:31:05 +05:30
Alex
b2118602d9 Merge pull request #335 from B2o5T/patch-1
fix syntax highlightning
2023-09-16 09:49:03 +01:00
Dimitri POSTOLOV
9303f3b47b Update API-docs.md 2023-09-16 02:18:01 +02:00
Alex
e5c43cfc4b Merge pull request #334 from arc53/support-for-docx
Include docx files in the frontend
2023-09-15 11:28:56 +01:00
Alex
45fc08e221 Update Upload.tsx 2023-09-15 11:28:23 +01:00
Alex
67e8511106 Update Upload.tsx 2023-09-15 11:27:08 +01:00
Pavel
4f7fd0a62b Merge pull request #333 from arc53/feature/update-guides
updated deployment and created react widget guide
2023-09-15 13:11:30 +03:00
Alex
88fe454962 removed unecessary comma 2023-09-15 11:08:21 +01:00
Alex
26f7a9be0a updated deployment and create react widget guide 2023-09-15 11:00:59 +01:00
Alex
9256926bb7 Update README.md 2023-09-14 22:22:28 +01:00
Alex
2a83318739 updates modal 2023-09-13 14:11:32 +01:00
Pavel
d6e2535a5e Merge pull request #330 from arc53/feature/better-widget
Feature/better widget
2023-09-12 20:05:01 +03:00
Alex
2bffb7e22c update widgets 2023-09-12 17:44:40 +01:00
Alex
24a162cf86 use all states 2023-09-12 17:43:41 +01:00
Alex
f3104f3bc4 different source docs 2023-09-12 17:37:26 +01:00
Alex
45f1bf6709 widget final 2023-09-12 17:36:41 +01:00
Alex
40b2590815 different imports 2023-09-12 17:25:08 +01:00
Alex
dd9ab46b5c Update theme.config.jsx 2023-09-12 17:21:32 +01:00
Alex
c2aeadae33 Update theme.config.jsx 2023-09-12 17:19:18 +01:00
Alex
1bd9759ab7 update package 2023-09-12 17:13:34 +01:00
Alex
dcdbb05168 Update theme.config.jsx 2023-09-12 17:00:45 +01:00
Alex
ae117c47e9 widget everywhere 2023-09-12 16:43:47 +01:00
Alex
7f7856f0e4 Local storage sync 2023-09-12 16:39:09 +01:00
Alex
aa7b7c8619 Update docs 2023-09-12 15:48:52 +01:00
Alex
ee0cbff245 cleanup 2023-09-12 15:42:31 +01:00
Alex
c2c18b25d2 widget 0.2.0 2023-09-12 15:41:05 +01:00
Alex
816c7c95ed react-widget 2023-09-12 14:01:12 +01:00
Alex
cb5d65d11a widget init 2023-09-08 13:30:08 +01:00
Alex
75f3f43ba0 Merge pull request #327 from larinam/patch-2 2023-09-08 01:31:55 +01:00
Alex
9a521355ed Merge pull request #326 from larinam/remove-static 2023-09-08 01:30:59 +01:00
Anton Larin
47bfdf0710 Extended info on .env 2023-09-07 21:16:03 +02:00
Anton Larin
e1b49c3fb4 remove old static resources from the Flask application, forgotten leftover. 2023-09-07 18:32:45 +02:00
Alex
374dffc5fa little fix 2023-09-07 12:43:59 +01:00
Alex
4f735a5d11 Nextra docs 2023-09-07 12:36:39 +01:00
Alex
94738d8fc4 Merge pull request #325 from larinam/remove-static
remove old static resources from the Flask application, update the ro…
2023-09-07 09:51:33 +01:00
Anton Larin
adb4bfa10b remove old static resources from the Flask application, update the routing in app.py 2023-09-07 10:19:58 +02:00
Alex
48e6bbdc97 Merge pull request #322 from larinam/patch-1
Update CONTRIBUTING.md - information about running unit tests
2023-09-05 10:01:37 +01:00
Anton Larin
b54d6fea44 Update CONTRIBUTING.md - information about running unit tests 2023-09-05 06:31:27 +02:00
Alex
4462e6339d Merge pull request #320 from larinam/test-codecov
add simple test to make a PR to check CodeCov
2023-09-04 18:38:46 +01:00
Anton Larin
c1581b69f4 small optimization 2023-09-04 19:32:56 +02:00
Alex
14284e0cc7 Update test_app.py 2023-09-04 18:25:41 +01:00
Anton Larin
de40e733ec add simple test to make a PR to check CodeCov 2023-09-04 19:13:51 +02:00
Alex
9d91b6f780 Merge pull request #315 from arc53/codecov-integration
Create codecov.yml
2023-09-04 16:24:02 +01:00
Alex
6a8b49f9c4 Create codecov.yml 2023-09-04 14:48:20 +01:00
Alex
445a8a5647 Merge pull request #313 from arc53/codecov-integration
Update pytest.yml
2023-09-04 14:46:59 +01:00
Alex
83ce4a538a Update pytest.yml 2023-09-04 14:23:44 +01:00
Alex
35a19d2007 Update .env-template 2023-08-31 13:48:33 +01:00
Alex
505e12c5ea Merge pull request #306 from larinam/pytest-introduction
Adapt documentation to existing tests.
2023-08-23 09:37:57 +01:00
Alex
b2bfd7f23a Update README.md 2023-08-22 09:32:20 +01:00
Alex
cdb96e715d Update README.md 2023-08-21 22:19:47 +01:00
Alex
b3e5f09e3b Merge pull request #308 from larinam/revert-pytest-introduction
revert introduction of the coverage note addition to pull requests as…
2023-08-21 22:18:47 +01:00
Alex
db542d668a Update README.md 2023-08-21 22:16:50 +01:00
Anton Larin
a8a79a55a4 revert introduction of the coverage note addition to pull requests as it doesn't work for pull requests from public forks. see GitHub documentation: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token 2023-08-21 21:24:29 +02:00
Anton Larin
47f62a87a7 Revert "experiment with permissions"
This reverts commit 44f353861a.
2023-08-21 20:19:37 +02:00
Anton Larin
44f353861a experiment with permissions 2023-08-21 13:48:18 +02:00
Anton Larin
a2ef84a4a0 Adapt documentation to existing tests. 2023-08-18 17:43:17 +02:00
Alex
12ac20ec43 Merge pull request #304 from larinam/pytest-introduction
count test coverage
2023-08-17 16:17:57 +01:00
Anton Larin
ecfbc7b9fd count coverage 2023-08-16 16:35:48 +02:00
Alex
ba2fe0fb1f Update README.md 2023-08-15 14:54:19 +01:00
Alex
890a20edba Merge pull request #303 from arc53/feature/hf-docs-models
Support for hf models optimised for docsgpt
2023-08-15 14:36:56 +01:00
Alex
e6f48c9403 Merge pull request #302 from arc53/dartpain-patch-2
Update README.md
2023-08-15 14:33:28 +01:00
Alex
909f0afa69 ruff fixes 2023-08-15 14:33:17 +01:00
Alex
5ed2b99b8c Update README.md 2023-08-15 14:28:59 +01:00
Alex
7848751fd8 Update README.md 2023-08-15 14:27:32 +01:00
Alex
e593241d75 Support for hf models optimised for docsgpt 2023-08-15 14:14:49 +01:00
Alex
fcdc7b7aeb Merge pull request #292 from arc53/dependabot/pip/application/cryptography-41.0.3
Bump cryptography from 41.0.2 to 41.0.3 in /application
2023-08-15 00:15:09 +01:00
Alex
c3c7878f28 Merge pull request #300 from larinam/pytest-introduction
Introduce tests with pytest
2023-08-14 23:05:38 +01:00
Anton Larin
85f9ae5a0a fix packaging and imports and introduce tests with pytest.
still issues with celery worker.
2023-08-14 18:53:25 +02:00
Anton Larin
98a97f34f5 fix packaging and imports and introduce tests with pytest.
still issues with celery worker.
2023-08-14 18:20:25 +02:00
dependabot[bot]
98d647a3fe Bump cryptography from 41.0.2 to 41.0.3 in /application
Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.2 to 41.0.3.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.2...41.0.3)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-08-13 20:33:47 +00:00
Alex
9a393b4f74 Merge pull request #299 from larinam/main
fixes for requirements.txt
2023-08-13 21:33:10 +01:00
Anton Larin
88d74235e1 1. remove unnecessary dependencies to speed up the build and reduce the downloads volume.
2. upgrade a couple of libraries
2023-08-13 21:48:24 +02:00
Anton Larin
36fa470348 1. remove unnecessary dependencies to speed up the build and reduce the downloads volume.
2. upgrade a couple of libraries
2023-08-13 21:28:48 +02:00
Alex
33dce10bc3 Merge pull request #296 from larinam/revert_breaking_renaming_azure_change
Revert "Changed environment variable names OPENAI_API_BASE and OPENAI…
2023-08-08 18:15:45 +01:00
Alex
feed0b288f Merge pull request #297 from larinam/upgrade_scripts_requirements
upgrade versions and remove transitive and unnecessary dependencies for scripts subproject
2023-08-05 22:10:49 +01:00
Alex
1b7dc8a509 Merge pull request #298 from arc53/feature/hisotry
fix the styling in a box
2023-08-05 22:02:42 +01:00
Alex
87cc3cf168 fix the styling in a box 2023-08-05 21:58:54 +01:00
Alex
eac7b1e9f2 Merge pull request #295 from larinam/min_tokens_logic_fix
Fix min_tokens logic for grouping documents
2023-08-05 14:36:34 +01:00
Anton Larin
bb1a42df91 upgrade versions and remove transitive and unnecessary dependencies 2023-08-05 15:01:53 +02:00
Anton Larin
ac5ac3e9f1 Revert "Changed environment variable names OPENAI_API_BASE and OPENAI_API_VERSION to AZURE_OPENAI_API_BASE and AZURE_OPENAI_API_VERSION"
This reverts commit ce8b29e9d0.
2023-08-05 14:08:51 +02:00
Anton Larin
bed25b317c Fix min_tokens logic for grouping documents: documents with (lengh >= min_tokens) should not be grouped into one document for indexing 2023-08-05 13:18:52 +02:00
Pavel
1687e6682a Merge pull request #290 from arc53/feature/hisotry
history
2023-07-26 22:26:11 +01:00
Alex
22572c8ed1 ruff fixes 2023-07-26 22:21:32 +01:00
Alex
8187a339f0 little bug fix 2023-07-26 22:17:18 +01:00
Alex
382c3930a2 history 2023-07-26 22:14:54 +01:00
Alex
a64a30c088 fix 2023-07-24 16:23:49 +01:00
Alex
dac76a867f fix tokens for header 2023-07-24 16:14:08 +01:00
Alex
b2e86e105d Merge pull request #282 from arc53/dependabot/npm_and_yarn/frontend/word-wrap-1.2.4
Bump word-wrap from 1.2.3 to 1.2.4 in /frontend
2023-07-20 09:51:48 +01:00
dependabot[bot]
b8e57c9b6f Bump word-wrap from 1.2.3 to 1.2.4 in /frontend
Bumps [word-wrap](https://github.com/jonschlinkert/word-wrap) from 1.2.3 to 1.2.4.
- [Release notes](https://github.com/jonschlinkert/word-wrap/releases)
- [Commits](https://github.com/jonschlinkert/word-wrap/compare/1.2.3...1.2.4)

---
updated-dependencies:
- dependency-name: word-wrap
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-07-20 06:16:47 +00:00
Alex
486a1bc9de Merge pull request #281 from arc53/dependabot/pip/scripts/cryptography-41.0.2
Bump cryptography from 41.0.0 to 41.0.2 in /scripts
2023-07-15 16:34:09 +01:00
dependabot[bot]
b1b610f4b5 Bump cryptography from 41.0.0 to 41.0.2 in /scripts
Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.0 to 41.0.2.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.0...41.0.2)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-07-15 15:21:20 +00:00
Alex
68447a6009 Merge pull request #280 from arc53/dependabot/pip/application/cryptography-41.0.2
Bump cryptography from 41.0.0 to 41.0.2 in /application
2023-07-15 16:20:37 +01:00
dependabot[bot]
a55280b941 Bump cryptography from 41.0.0 to 41.0.2 in /application
Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.0 to 41.0.2.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.0...41.0.2)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-07-15 01:25:02 +00:00
Alex
830462d525 Merge pull request #278 from RikSchoonbeek/openai_api_connection_error
(issue 276) Fix for openai.error.APIConnectionError when asking a question to a document
2023-07-12 21:10:53 +01:00
Rik Schoonbeek
ce8b29e9d0 Changed environment variable names OPENAI_API_BASE and OPENAI_API_VERSION to AZURE_OPENAI_API_BASE and AZURE_OPENAI_API_VERSION 2023-07-12 17:37:56 +02:00
Alex
6ab15f8eb1 Merge pull request #275 from Sukhacoder02/feat/generic-modal
feat: use generic Modal with render props
2023-07-02 18:21:01 +07:00
Sukhman S
96eb68e042 feat: use generic Modal with render props 2023-07-02 10:35:03 +05:30
Sukhman S
bf78bdd6d4 feat: pass ErrorMessage as a prop to Modal 2023-07-02 10:34:43 +05:30
Sukhman S
d998815847 fix: fix styles for error message 2023-07-01 23:19:27 +05:30
Sukhman S
00ba7b78ca feat: use generic Modal with render props 2023-07-01 23:04:45 +05:30
Sukhman S
0b735d94f1 feat: add generic Modal component 2023-07-01 23:03:29 +05:30
Alex
301989540f Update sync_fork.yaml 2023-06-30 07:26:50 +07:00
Alex
e26b95a26f Merge pull request #274 from UndrscoreEX/readme_update
Added some missing info on the Readme
2023-06-29 11:54:48 +07:00
ethan_h
049c1ddb48 added some info about the stack and install instructions to Readme documentation 2023-06-29 13:17:59 +09:00
Alex
2f1c3075a2 Merge pull request #253 from arc53/dependabot/pip/scripts/cryptography-41.0.0
Bump cryptography from 39.0.2 to 41.0.0 in /scripts
2023-06-28 00:40:37 +07:00
Alex
b1a5068fd6 Merge pull request #254 from arc53/dependabot/pip/application/cryptography-41.0.0
Bump cryptography from 39.0.2 to 41.0.0 in /application
2023-06-28 00:40:17 +07:00
Alex
01fbd5d702 Merge pull request #247 from arc53/dependabot/pip/application/flask-2.2.5
Bump flask from 2.2.3 to 2.2.5 in /application
2023-06-28 00:31:30 +07:00
Alex
5916f92f1a Merge pull request #245 from arc53/dependabot/pip/scripts/requests-2.31.0
Bump requests from 2.28.2 to 2.31.0 in /scripts
2023-06-28 00:27:04 +07:00
Alex
5e45268f68 Merge pull request #244 from arc53/dependabot/pip/application/requests-2.31.0
Bump requests from 2.28.2 to 2.31.0 in /application
2023-06-28 00:26:50 +07:00
Alex
b8e28e0c12 Merge pull request #256 from arc53/dependabot/npm_and_yarn/frontend/vite-4.1.5
Bump vite from 4.1.4 to 4.1.5 in /frontend
2023-06-25 10:54:15 +07:00
Alex
04f824ea36 Merge pull request #272 from idan-h/main
Fixed bugs, changed backend port
2023-06-25 10:51:55 +07:00
Alex
c216bea031 Update app.py 2023-06-25 10:51:45 +07:00
Alex
e72ef478dc Merge pull request #270 from larinam/workflow-fix
prevent this action from running on forks as it fails anyway.
2023-06-25 00:22:27 +07:00
Idan
897b4ef2cd Fixed a bug with reading md files 2023-06-23 14:57:29 +03:00
Idan
2404899e28 Fixed request length bug, changed to as less used port 2023-06-23 14:56:14 +03:00
Anton Larin
a2dfc2cbdc prevent this action from running on forks as it fails anyway. 2023-06-18 13:57:12 +02:00
Alex
92373b25a9 Merge pull request #269 from larinam/main
Azure implementation for the streaming output
2023-06-18 12:42:53 +01:00
GH Action - Upstream Sync
ce1840a9ae Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-06-18 08:04:04 +00:00
Alex
c4f4bdd789 Update README.md 2023-06-18 08:57:27 +01:00
Alex
ec5068e85b Update README.md 2023-06-18 08:57:10 +01:00
Alex
1d9d0ddf27 Update README.md 2023-06-18 08:56:32 +01:00
GH Action - Upstream Sync
e393be90dd Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-06-17 18:04:12 +00:00
Alex
e633df06e4 Update README.md 2023-06-17 18:03:41 +01:00
Anton Larin
0ff5f408d6 fix configuration to support streaming answer with Azure 2023-06-17 17:40:31 +02:00
Anton Larin
5eda42ff31 fix configuration to support streaming answer with Azure 2023-06-17 17:31:21 +02:00
Anton Larin
84168e22d0 add missing variable after testin and minor fixes. 2023-06-17 16:09:22 +02:00
GH Action - Upstream Sync
b722845aff Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-06-17 12:04:20 +00:00
Alex
fd54682c02 Update README.md 2023-06-17 12:48:07 +01:00
Anton Larin
f5e287ffa6 optimized imports 2023-06-17 13:38:50 +02:00
Anton Larin
fb10a546d6 running with docker-compose with .sh script for more convenience with Azure. 2023-06-17 13:35:05 +02:00
Anton Larin
006897f1c0 Azure support for streaming output. 2023-06-17 13:20:29 +02:00
Anton Larin
968849e52b code readability, formatting, minor version bump 2023-06-17 12:42:05 +02:00
Alex
8bee47dc50 Merge pull request #267 from tardigrde/patch-1 2023-06-16 09:47:44 +01:00
Levente Csőke
08250120d1 Update README.md 2023-06-16 07:49:11 +02:00
Levente Csőke
8892b70785 Fix dev setup docs md rendering
Ordered list was badly formatted
2023-06-16 07:47:38 +02:00
Alex
534e4cb591 Merge pull request #259 from arc53/dependabot/pip/scripts/transformers-4.30.0
Bump transformers from 4.26.0 to 4.30.0 in /scripts
2023-06-16 01:38:51 +01:00
Alex
489abdcb0b Merge pull request #260 from arc53/dependabot/pip/application/transformers-4.30.0
Bump transformers from 4.27.2 to 4.30.0 in /application
2023-06-16 01:38:20 +01:00
Alex
f6b6c2e9a3 Update requirements.txt 2023-06-16 01:36:45 +01:00
Alex
43c016f024 Update requirements.txt 2023-06-16 01:35:05 +01:00
Alex
c0e7d9cd8b Merge pull request #266 from arc53/dartpain-patch-1
Update README.md
2023-06-15 22:44:21 +01:00
Alex
5f687a31f8 Update README.md 2023-06-15 22:31:40 +01:00
dependabot[bot]
f2d2478dee Bump transformers from 4.27.2 to 4.30.0 in /application
Bumps [transformers](https://github.com/huggingface/transformers) from 4.27.2 to 4.30.0.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.27.2...v4.30.0)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-15 16:53:40 +00:00
dependabot[bot]
8a98789be1 Bump transformers from 4.26.0 to 4.30.0 in /scripts
Bumps [transformers](https://github.com/huggingface/transformers) from 4.26.0 to 4.30.0.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.26.0...v4.30.0)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-15 16:53:39 +00:00
dependabot[bot]
87a5c8894a Bump flask from 2.2.3 to 2.2.5 in /application
Bumps [flask](https://github.com/pallets/flask) from 2.2.3 to 2.2.5.
- [Release notes](https://github.com/pallets/flask/releases)
- [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/flask/compare/2.2.3...2.2.5)

---
updated-dependencies:
- dependency-name: flask
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-15 16:53:35 +00:00
Alex
7e92ed4501 Merge pull request #265 from tardigrde/main
Improve dev setup docs
2023-06-15 17:52:50 +01:00
Alex
a57cdfff1e Update requirements.txt 2023-06-15 17:52:16 +01:00
Alex
d4ff6d4d7a minor fix 2023-06-15 17:40:58 +01:00
Levente Csoke
63d99d6a57 Change min node version to v16
Husky won't run with Node v14
2023-06-15 17:04:05 +02:00
Levente Csoke
fce7d34171 Improve dev setup in README
Add needed packages in for dev setup in requirements.txt
2023-06-15 17:02:52 +02:00
Alex
e7df7f69b3 Merge pull request #264 from arc53/bug/sendingsources
fix for sending sources + azure embeddings issue
2023-06-15 11:39:31 +01:00
Alex
94cc18bd71 fix for sending sources + azure embeddings issue 2023-06-15 11:35:21 +01:00
Alex
39024ce2ac Update README.md 2023-06-13 15:43:42 +01:00
Alex
7ac4f45e7b Update README.md 2023-06-13 11:04:39 +01:00
Alex
f209eebaf8 Merge pull request #255 from larinam/azure
Azure support #199
2023-06-12 20:15:18 +01:00
Alex
4889db78c9 Update .env_sample 2023-06-12 20:13:58 +01:00
Alex
bff200fede Sources by filename if available 2023-06-12 14:56:42 +01:00
Alex
af6f783043 hotfix 2023-06-11 23:04:02 +01:00
Alex
610adcbefc Sources in responses 2023-06-11 22:56:34 +01:00
dependabot[bot]
1d3631fa04 Bump vite from 4.1.4 to 4.1.5 in /frontend
Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 4.1.4 to 4.1.5.
- [Release notes](https://github.com/vitejs/vite/releases)
- [Changelog](https://github.com/vitejs/vite/blob/v4.1.5/packages/vite/CHANGELOG.md)
- [Commits](https://github.com/vitejs/vite/commits/v4.1.5/packages/vite)

---
updated-dependencies:
- dependency-name: vite
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-06 02:19:21 +00:00
Anton Larin
0630504664 Merge branch 'main' into azure 2023-06-04 10:34:14 +02:00
Alex
577d58c92b less token less issues 2023-06-03 16:31:10 +01:00
Alex
899777632b Update README.md 2023-06-03 16:09:10 +01:00
Anton Larin
6d5b698c39 fix arc53/DocsGPT#199 2023-06-03 11:04:04 +02:00
Anton Larin
dd9f1abcea fix arc53/DocsGPT#199 2023-06-03 11:03:44 +02:00
Anton Larin
b4bd34fb96 fix arc53/DocsGPT#199 2023-06-03 10:58:31 +02:00
dependabot[bot]
014971262d Bump cryptography from 39.0.2 to 41.0.0 in /application
Bumps [cryptography](https://github.com/pyca/cryptography) from 39.0.2 to 41.0.0.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/39.0.2...41.0.0)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-02 20:20:32 +00:00
dependabot[bot]
36ed69b07e Bump cryptography from 39.0.2 to 41.0.0 in /scripts
Bumps [cryptography](https://github.com/pyca/cryptography) from 39.0.2 to 41.0.0.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/39.0.2...41.0.0)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-06-02 20:20:12 +00:00
Alex
bbf55ca46e Merge pull request #250 from tardigrde/main 2023-06-01 14:56:48 +01:00
Alex
3f88b04c4a Update app.py 2023-05-31 23:49:41 +01:00
Alex
f8910ba136 Added history in streaming convo + fixed little bug with message margins on loading state 2023-05-31 23:47:16 +01:00
Alex
6c95d8b13e Merge pull request #251 from arc53/feature/streaming
Feature/streaming
2023-05-31 22:30:57 +01:00
Alex
e6bccaaf4e Update app.py 2023-05-31 22:20:47 +01:00
Alex
3b8039a580 Merge branch 'main' into feature/streaming 2023-05-31 22:15:53 +01:00
Alex
fae3f55010 Working streaming 2023-05-31 17:44:20 +01:00
Alex
20c877f75b working fe 2023-05-31 15:42:17 +01:00
Alex
8380858a82 some fixes 2023-05-30 20:00:41 +01:00
Alex
d2358c399d working version 2023-05-30 19:43:06 +01:00
Alex
c3af8a77af working streams 2023-05-29 17:55:43 +01:00
Levente Csőke
bc5a0b030b Update .env-template to OPENAI_API_KEY 2023-05-26 08:57:11 +02:00
Alex
0b94f1717f Merge pull request #246 from arc53/feature/gpt4all
Feature/gpt4all
2023-05-25 19:42:20 +01:00
Alex
aaa1249a41 model fix + env var 2023-05-25 19:33:37 +01:00
Alex
ffaa22c49b reverse history order to use latest history firts
Co-Authored-By: Pavel <32868631+pabik@users.noreply.github.com>
2023-05-25 16:40:11 +01:00
Alex
0b78480977 init 2023-05-25 15:14:47 +01:00
dependabot[bot]
ec4fc17e3a Bump requests from 2.28.2 to 2.31.0 in /scripts
Bumps [requests](https://github.com/psf/requests) from 2.28.2 to 2.31.0.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.2...v2.31.0)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-23 06:20:49 +00:00
dependabot[bot]
78b85fb664 Bump requests from 2.28.2 to 2.31.0 in /application
Bumps [requests](https://github.com/psf/requests) from 2.28.2 to 2.31.0.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.2...v2.31.0)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-23 06:07:55 +00:00
Alex
6b6737613a Merge pull request #243 from nazihkalo/main
updating the bulk ingest file metadata logic
2023-05-20 16:02:32 +01:00
Nazih Kalo
da5d62cc1c updating the bulk ingest file metadata to account for parsers that output lists 2023-05-19 10:29:18 -07:00
Alex
6a68b63192 history fix 2023-05-19 13:09:41 +01:00
Alex
ff2e79fe7b streaming experiments 2023-05-18 23:52:59 +01:00
Alex
1800e51b19 Merge pull request #241 from arc53/feature/history
Feature/history
2023-05-18 18:50:35 +01:00
Alex
ba9c505249 accidentaly deleted frontend container 2023-05-18 18:45:15 +01:00
Alex
bc9f1c17ed History
Co-Authored-By: riccardofresi <89981746+riccardofresi@users.noreply.github.com>
2023-05-18 18:42:23 +01:00
Alex
74845aed64 history init 2023-05-18 14:27:13 +01:00
Alex
e49dd0cc6a metadata on ingestion 2023-05-17 21:41:24 +01:00
Alex
27c45ae24a Merge pull request #236 from larinam/fixbuild_github_token
fix workflow: upgrade "build and push" action version to the latest
2023-05-16 12:04:27 +01:00
Anton Larin
364a14adaf fix workflow: upgrade "build and push" action version to the latest 2023-05-16 08:02:13 +02:00
Alex
5c560b1dd5 Merge pull request #235 from larinam/fixbuild_github_token
fix workflow: adjust permissions according to documentation
2023-05-15 23:17:53 +01:00
Anton Larin
28b8b88332 fix workflow: adjust permissions according to documentation
https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#publishing-a-package-using-an-action
2023-05-15 21:22:06 +02:00
Alex
e39ef0cc9e Merge pull request #234 from larinam/fixbuild_github_token
fix workflow: login to GHCR according to the GH documentation
2023-05-15 18:17:48 +01:00
Anton Larin
8098d3fec8 fix workflow nad login to GHCR according to the GH documentation
https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions
2023-05-15 18:55:40 +02:00
Alex
059ffe09ea Merge pull request #232 from larinam/lint
Lint
2023-05-15 13:53:09 +01:00
Alex
36a845c29e Merge pull request #231 from larinam/main
Proper PEP8 formatting
2023-05-15 13:45:52 +01:00
GH Action - Upstream Sync
ce6f0dab56 Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-05-15 12:05:18 +00:00
Alex
f200ab10a4 Merge pull request #233 from arc53/dependabot/pip/scripts/flask-2.2.5
Bump flask from 2.2.2 to 2.2.5 in /scripts
2023-05-15 12:50:30 +01:00
Alex
3001688e0e Update requirements.txt 2023-05-15 12:46:39 +01:00
GH Action - Upstream Sync
a73774099e Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-05-15 11:03:45 +00:00
dependabot[bot]
b28676d52c Bump flask from 2.2.2 to 2.2.5 in /scripts
Bumps [flask](https://github.com/pallets/flask) from 2.2.2 to 2.2.5.
- [Release notes](https://github.com/pallets/flask/releases)
- [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/flask/compare/2.2.2...2.2.5)

---
updated-dependencies:
- dependency-name: flask
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-15 11:00:19 +00:00
Alex
eef012b4d1 Merge pull request #225 from arc53/dependabot/pip/application/flask-2.3.2
Bump flask from 2.2.3 to 2.3.2 in /application
2023-05-15 11:58:54 +01:00
Alex
1417a1c020 Update requirements.txt 2023-05-15 11:49:41 +01:00
Anton Larin
962becb9a5 Linting
* validate python formatting on every build with Ruff
* fix lint warnings
2023-05-13 10:36:17 +02:00
Anton Larin
168648e789 Proper PEP8 formatting 2023-05-12 12:02:25 +02:00
Alex
7f56f57778 better markdown styling 2023-05-06 15:22:23 +01:00
Alex
6cadddc2fc Merge pull request #223 from Zillibub/main
Moved env variables to the pydantic settings file
2023-05-02 11:07:52 +01:00
dependabot[bot]
15fd54eac4 Bump flask from 2.2.3 to 2.3.2 in /application
Bumps [flask](https://github.com/pallets/flask) from 2.2.3 to 2.3.2.
- [Release notes](https://github.com/pallets/flask/releases)
- [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/flask/compare/2.2.3...2.3.2)

---
updated-dependencies:
- dependency-name: flask
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-02 00:27:05 +00:00
Serj
31350e6302 Set celery and mongo urls as default 2023-04-30 11:03:09 +01:00
Serj
8742cdae0a Refactored url join 2023-04-30 10:46:52 +01:00
Serj
4efcb388ff Added settings usage to the worker 2023-04-29 15:58:02 +01:00
Serj
2d92e95c8a Added settings usage to the worker 2023-04-29 15:56:32 +01:00
Serj
47e5d5684a Replace other env variables in the file 2023-04-29 15:50:02 +01:00
Serj
b723e14d98 Added embeddings name variable 2023-04-29 15:46:09 +01:00
Serj
c9d24b8f42 Added llm model variable 2023-04-29 15:44:47 +01:00
Serj
43622e7ab1 Added settings file 2023-04-29 15:40:55 +01:00
Serge Kozloff
5cfc185ba5 Merge pull request #2 from arc53/main
d
2023-04-29 15:39:46 +01:00
Alex
4be2635fbe Merge pull request #221 from darth-pika-hu/main
Create setup.sh
2023-04-27 21:36:29 +01:00
Darth Pika
0beafb8391 Update setup.sh
This script includes the necessary changes to use container linking and updated environment variables for the `backend` and `worker` containers.

Make sure you have the `./frontend` and `./application` directories in the correct locations before running the script.
2023-04-27 12:39:03 -07:00
Darth Pika
1d2654b9fa Update setup.sh
Create required directories on the host machine if they don't exist.
2023-04-27 12:02:11 -07:00
Darth Pika
a4bc3673e7 Create setup.sh
Added a bash script to help with installation issues.
2023-04-27 11:40:25 -07:00
Alex
fa080537e8 Merge pull request #220 from Zillibub/main
Updated readme for development run
2023-04-27 12:27:20 +01:00
Serj
bdf67a7db7 Added dev docker compose file 2023-04-26 19:05:50 +01:00
Serge Kozloff
db4cdc901c Merge pull request #1 from arc53/main
t
2023-04-26 18:55:39 +01:00
Serj
16a540b89b Expand readme and added port in wsgi 2023-04-26 18:54:59 +01:00
Alex
e00ec9ac3f Update chat_combine_prompt.txt 2023-04-26 15:01:46 +01:00
Alex
fc760afdfc Update chat_combine_prompt.txt 2023-04-26 14:54:26 +01:00
Alex
cb47bcdb0e Update ConversationBubble.tsx 2023-04-26 13:35:05 +01:00
Alex
8d62559ca8 Merge pull request #219 from arc53/feature/code-highlighting
code highlighting
2023-04-26 10:30:39 +01:00
Alex
dbe9c4dc18 init 2023-04-25 17:01:44 +01:00
Serj
1609b4562d Added mongo db start 2023-04-24 19:22:42 +01:00
Serj
b6cadb1d65 Removed spaces 2023-04-24 18:46:05 +01:00
Serj
7aafac5b5e Expanded developer start a little bit 2023-04-24 18:39:53 +01:00
Pavel
36f0aacb19 Merge pull request #218 from arc53/feature/web-widget
web widget
2023-04-23 15:12:18 +01:00
Alex
0c1a6a918d web widget 2023-04-23 15:07:55 +01:00
Alex
d1f5ff4dba Merge pull request #214 from SAMZONG/main 2023-04-18 15:12:54 +01:00
samzong
77e6df2a1c add auto sync fork for workflow
Signed-off-by: samzong <samzong.lu@gmail.com>
2023-04-18 04:24:10 +00:00
Alex
119c037f24 Merge pull request #209 from arc53/dot-env
.env
2023-04-11 22:50:19 +01:00
Alex
97fe1abfd8 .env
Co-Authored-By: Subhadip N <subhadip@get-deck.com>
2023-04-11 22:49:47 +01:00
Alex
3a0163f0fb Merge pull request #202 from yuchen9/feat/ui-enhancement
feat: ui enhancement
2023-04-07 11:17:12 +01:00
Chen
d3fab69155 feat: ui enhancement 2023-04-06 23:54:16 +08:00
Alex
9395d2c091 celery load 2023-04-06 12:16:30 +01:00
Alex
b9efb98280 Update README.md 2023-04-04 14:12:35 +01:00
Alex
60bb264663 async calls 2023-04-03 14:37:09 +01:00
Alex
316dd2f165 Merge pull request #197 from arc53/dependabot/pip/application/redis-4.5.4
Bump redis from 4.5.3 to 4.5.4 in /application
2023-04-03 13:01:13 +01:00
dependabot[bot]
8a0f700563 Bump redis from 4.5.3 to 4.5.4 in /application
Bumps [redis](https://github.com/redis/redis-py) from 4.5.3 to 4.5.4.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.3...v4.5.4)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-31 14:31:03 +00:00
Alex
3d0c6eafec gpt4- compatable 2023-03-31 10:45:40 +01:00
Alex
46e055833b Merge pull request #196 from arc53/dependabot/pip/scripts/redis-4.5.4
Bump redis from 4.5.3 to 4.5.4 in /scripts
2023-03-30 12:52:15 +01:00
dependabot[bot]
80dfdd1cb9 Bump redis from 4.5.3 to 4.5.4 in /scripts
Bumps [redis](https://github.com/redis/redis-py) from 4.5.3 to 4.5.4.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.3...v4.5.4)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-30 11:49:21 +00:00
Alex
db21678b74 Merge pull request #192 from arc53/dependabot/pip/scripts/redis-4.5.3
Bump redis from 4.5.1 to 4.5.3 in /scripts
2023-03-30 12:48:58 +01:00
Alex
09c7fe0565 Merge pull request #193 from arc53/dependabot/pip/application/redis-4.5.3
Bump redis from 4.5.2 to 4.5.3 in /application
2023-03-30 12:48:35 +01:00
Alex
b6dfb2c856 map_reduce 2023-03-30 12:44:25 +01:00
Alex
ab46ba521f different prompts 2023-03-29 18:36:58 +01:00
Alex
4a7670f2aa Update app.py 2023-03-29 17:32:00 +01:00
Alex
9ba86bc174 Update preferenceSlice.ts 2023-03-28 10:19:42 +01:00
Pavel
2ebe5e051c discord bot fix
Stop random answers
2023-03-28 01:51:54 +04:00
dependabot[bot]
24e98abd15 Bump redis from 4.5.2 to 4.5.3 in /application
Bumps [redis](https://github.com/redis/redis-py) from 4.5.2 to 4.5.3.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.2...v4.5.3)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-27 21:36:39 +00:00
dependabot[bot]
b7f1a94ba4 Bump redis from 4.5.1 to 4.5.3 in /scripts
Bumps [redis](https://github.com/redis/redis-py) from 4.5.1 to 4.5.3.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.1...v4.5.3)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-27 21:34:50 +00:00
Alex
70bc7465c9 Merge pull request #191 from arc53/features/little-fixes
Features/little fixes
2023-03-27 22:28:55 +01:00
Alex
65c2568427 Update app.py 2023-03-27 22:23:36 +01:00
Alex
186e7bf402 update for better runs + storage sync 2023-03-27 22:07:26 +01:00
Alex
e6f1c7d0c3 mobile more space 2023-03-27 21:50:54 +01:00
Alex
87ad9a3190 Update Upload.tsx 2023-03-27 21:48:44 +01:00
Alex
0ed45f8754 fix pending status 2023-03-27 21:48:16 +01:00
Alex
116e4401c4 Update .env.production 2023-03-27 21:44:22 +01:00
Alex
c3c0e643d2 Update chat_combine_prompt.txt 2023-03-27 21:42:06 +01:00
Alex
d5522e7c08 prep things 2023-03-27 19:29:10 +01:00
Alex
658b14ba26 failed upload 2023-03-27 19:22:06 +01:00
Alex
38f8469d0b Update Navigation.tsx 2023-03-27 19:11:57 +01:00
Alex
ce6750be15 Update README.md 2023-03-24 09:32:40 +00:00
Alex
f2e600cff9 Merge branch 'main' of https://github.com/arc53/docsgpt 2023-03-23 15:52:59 +00:00
Alex
3dab4b0b1e Update .env_sample 2023-03-23 15:52:47 +00:00
Alex
5d47aaff29 Update README.md 2023-03-23 15:51:12 +00:00
Alex
625448fcd1 Update Navigation.tsx 2023-03-22 17:16:53 +00:00
Alex
e0258cfc51 Goodbye annoying braket 2023-03-21 22:30:30 +00:00
Alex
92993ee105 Small fixes + polishing 2023-03-21 22:16:09 +00:00
Alex
ce579293fb Merge pull request #185 from arc53/feature/upload 2023-03-20 15:53:36 +00:00
Alex
d56db14fc7 Merge pull request #184 from arc53/feature/upload-exit-btn
Feature/upload exit btn
2023-03-20 14:36:21 +00:00
Alex
3e98f9e6bd Button working now 2023-03-20 14:34:51 +00:00
Alex
e7bd9b6323 Update Navigation.tsx 2023-03-20 14:19:12 +00:00
Alex
13699f5c02 Merge pull request #181 from arc53/feature/training-modal
Feature/training modal
2023-03-19 14:47:17 +00:00
Alex
f1f8341d25 fixes + combined new + path 2023-03-19 14:44:17 +00:00
Alex
796b4899aa Fixed progress + new path + combined new
Co-Authored-By: Ajay Thapliyal <ajaythapliyal1703@gmail.com>
2023-03-19 14:39:21 +00:00
ajaythapliyal
7dcbed644a avoid opening the upload window twice 2023-03-19 09:17:08 +05:30
ajaythapliyal
e6fe01876b some fixes 2023-03-19 09:10:53 +05:30
ajaythapliyal
9b75524d43 memoize the training model modal 2023-03-18 18:38:10 +05:30
ajaythapliyal
d98c876f82 adds training modals 2023-03-18 18:25:23 +05:30
Alex
be7b2fa0a4 Update README.md 2023-03-18 01:58:53 +00:00
Alex
28719e534c Merge pull request #177 from arc53/feature/discord
Botichello
2023-03-17 18:45:09 +00:00
Alex
620a7c6db2 Botichello 2023-03-17 18:44:12 +00:00
Alex
26450aca3a Merge pull request #176 from arc53/feature/upload-request
api-request
2023-03-17 13:46:04 +00:00
Alex
b1a3ff6cb1 api-request 2023-03-17 11:56:21 +00:00
Alex
ae2efc7f7b Merge pull request #174 from arc53/feature/upload-init
Adds upload modal UI
2023-03-17 11:20:44 +00:00
Alex
2523d039fb Little cloud - right size and location 2023-03-17 11:19:09 +00:00
ajaythapliyal
6dd13c6845 adds upload modal visibility 2023-03-17 13:25:21 +05:30
ajaythapliyal
dbaa116fe0 reads uploaded files and adds icon to launch upload feature 2023-03-17 13:05:44 +05:30
Alex
476071fc1b Merge pull request #173 from arc53/feature/chatwoot
chatwoot, label + id checks
2023-03-16 23:22:10 +00:00
Alex
66332ccf76 Update app.py 2023-03-16 23:19:02 +00:00
Alex
b3e9bb9ddb Update .env_sample 2023-03-16 23:13:13 +00:00
Alex
dff87b5fa3 chatwoot, label + id checks 2023-03-16 23:02:54 +00:00
Alex
7a00df65d0 Merge pull request #171 from genie88/feat-extention
feat(extension): migrate chrome extension v2 to v3
2023-03-16 13:43:20 +00:00
Alex
60cc1d8ee5 Update popup.js 2023-03-16 13:42:28 +00:00
genie
b67ade3610 feat(extension): migrate chrome extension v2 to v3 2023-03-16 13:58:11 +08:00
ajaythapliyal
97f47f5415 adds react dropzone to add upload functionality 2023-03-15 09:41:44 +05:30
ajaythapliyal
ddef31ecdf margin 2023-03-15 08:16:05 +05:30
ajaythapliyal
fa31f1ee26 modal ui done 2023-03-15 08:10:44 +05:30
Alex
8e477c9d16 update worker 2023-03-15 00:23:51 +00:00
Pavel
ce8f0ef9e1 Merge pull request #168 from arc53/feature/backend-uploads
Feature/backend uploads
2023-03-14 19:09:37 +04:00
Alex
4f64738f9e Update app.py 2023-03-14 14:36:40 +00:00
Alex
c4464455a1 cors + dependencies 2023-03-14 14:29:36 +00:00
Alex
254a6c2916 Merge pull request #169 from arc53/min-max-tokens
Min max tokens
2023-03-14 14:02:06 +00:00
Pavel
c9e1c326f5 - index.plk 2023-03-14 17:56:42 +04:00
Pavel
4532b6cd8c print minus 2023-03-14 17:49:57 +04:00
Pavel
53424a5c19 Added cli commands 2023-03-14 17:33:19 +04:00
Alex
bfb47da398 security things 2023-03-14 11:34:55 +00:00
Alex
cb96d90563 Update .gitignore 2023-03-14 10:36:27 +00:00
Pavel
b6c02c850a token ingeest 2023-03-14 13:32:29 +04:00
Alex
c297e076e6 folders 2023-03-13 21:56:09 +00:00
Alex
20a0800aa7 Create test_ingestion.py 2023-03-13 17:37:01 +00:00
Pavel
bac25112b7 v1 2023-03-13 19:14:33 +04:00
Alex
1d2162705d uploads backend first 2023-03-13 14:20:03 +00:00
ajaythapliyal
1a1f66d2a0 adds upload modal 2023-03-13 09:59:29 +05:30
Alex
a44cde33ed favicon 2023-03-10 11:48:52 +00:00
Alex
a9afd84787 Update README.md 2023-03-08 23:42:50 +00:00
Alex
ac0224b687 mdx format 2023-03-08 23:16:20 +00:00
Alex
8be2992c9a Update requirements.txt 2023-03-08 18:33:49 +00:00
Alex
e3ed23a0d4 Update requirements.txt 2023-03-08 18:20:37 +00:00
Pavel
377070e3a9 Merge pull request #163 from arc53/chat-prompts
chat prompts
2023-03-08 22:07:08 +04:00
Alex
6d959051e2 chat prompts 2023-03-08 17:50:07 +00:00
Alex
0799728000 Create requirements.txt 2023-03-08 11:44:02 +00:00
Alex
1f02f3b376 Update rst_parser.py 2023-03-08 11:32:44 +00:00
Alex
f7d7244588 chunks rst 2023-03-08 00:07:53 +00:00
Alex
352703827c Update README.md 2023-03-07 22:58:44 +00:00
Alex
5f4f55269e dependecies + default values 2023-03-07 18:37:21 +00:00
Alex
19e27e8403 Merge pull request #161 from arc53/feature/feedback
Feature/feedback
2023-03-07 13:24:40 +00:00
Alex
b41b960ef0 Merge pull request #160 from arc53/feature/feedback-cleanup-ux
Makes neutral state of feedback icon visible for mobile & makes outline color same as fill
2023-03-07 13:16:54 +00:00
ajaythapliyal
d1cc91dd6f makes feedback icon always visible for mobile 2023-03-07 09:44:53 +05:30
ajaythapliyal
b0b12856d5 makes the outline of feedback icon consistent with the inner color 2023-03-07 09:27:26 +05:30
ajaythapliyal
fdb19f8c49 Merge branch 'main' of github.com:arc53/DocsGPT into feature/feedback 2023-03-07 08:45:28 +05:30
Alex
b8a935ce3d Merge pull request #158 from arc53/feature/call-feedback-api
wires up feedback event handler with redux store
2023-03-06 19:43:53 +00:00
Alex
ec61b80fd3 Update Dockerfile 2023-03-06 19:28:22 +00:00
Alex
133863e601 Update Dockerfile 2023-03-06 19:23:56 +00:00
Alex
3767b85958 Update Dockerfile 2023-03-06 19:10:20 +00:00
Alex
a9672bc4a2 Merge pull request #159 from arc53/slimmer-conatiner
slimming
2023-03-06 18:38:30 +00:00
Alex
4f1e86d269 slimming 2023-03-06 18:34:48 +00:00
ajaythapliyal
ae36ff9394 flattens the params 2023-03-06 23:05:21 +05:30
ajaythapliyal
a888f38afb wires up the feedback event handler with redux storie 2023-03-06 23:02:15 +05:30
Alex
07e51dc8c6 sending data to aws 2023-03-06 10:23:52 +00:00
Alex
986648479a Merge pull request #155 from arc53/feedback-ui
Feedback UI
2023-03-06 09:49:26 +00:00
ajaythapliyal
533053f66f adds click event to the like/dislike icon 2023-03-06 08:32:57 +05:30
Alex
7a74c60fd1 updated backend 2023-03-06 01:39:23 +00:00
ajaythapliyal
d4d663de38 feedback UI 2023-03-05 22:50:24 +05:30
ajaythapliyal
37ae24b879 adds like dislike 2023-03-05 22:36:57 +05:30
ajaythapliyal
787a06d06e refactors bubble logic 2023-03-05 19:22:03 +05:30
Alex
43ca879f83 Merge pull request #154 from arc53/feature/feedback-api
Refactors singular Message model to Query model & adds api request to post feedbacks
2023-03-05 12:36:45 +00:00
ajaythapliyal
19f807b6c4 refactors the conversation model from singular messages to queries
consisting of prompt and response
2023-03-05 16:29:10 +05:30
ajaythapliyal
780f5893de adds api call and introduces model for conversations 2023-03-05 14:25:21 +05:30
Alex
54eea0ff04 Merge branch 'main' of https://github.com/arc53/docsgpt 2023-03-04 11:35:30 +00:00
Alex
480d91e818 Update cife.yml 2023-03-04 11:35:03 +00:00
Alex
820871329d Merge pull request #153 from arc53/feature/docker-compose
docker
2023-03-04 11:32:50 +00:00
Alex
f7d31fe615 docker 2023-03-04 11:28:36 +00:00
Alex
8d9648391b Merge pull request #150 from arc53/feature/taylor-fixes
two fixes
2023-03-04 10:22:05 +00:00
TaylorS15
d14438bf54 navigation changes/bugfix
fixed chats being visible under text below chat box
2023-03-03 16:15:35 -05:00
215 changed files with 23727 additions and 2845 deletions

9
.env-template Normal file

@@ -0,0 +1,9 @@
OPENAI_API_KEY=<LLM api key (for example, open ai key)>
SELF_HOSTED_MODEL=false
VITE_API_STREAMING=true
#For Azure
OPENAI_API_BASE=
OPENAI_API_VERSION=
AZURE_DEPLOYMENT_NAME=
AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
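
A minimal sketch of how these variables might be consumed, assuming the pydantic v1-style BaseSettings referenced in the "Moved env variables to the pydantic settings file" commit above. The variable names mirror the template; the Settings class and its defaults are illustrative, not the repository's actual settings module:

```python
# Illustrative sketch; the real settings module in /application may differ.
from pydantic import BaseSettings  # pydantic v1 (moved to pydantic-settings in v2)


class Settings(BaseSettings):
    OPENAI_API_KEY: str = ""       # LLM API key (for example, an OpenAI key)
    SELF_HOSTED_MODEL: bool = False
    VITE_API_STREAMING: bool = True
    # For Azure
    OPENAI_API_BASE: str = ""
    OPENAI_API_VERSION: str = ""
    AZURE_DEPLOYMENT_NAME: str = ""
    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: str = ""

    class Config:
        env_file = ".env"


settings = Settings()  # reads the environment, falling back to .env
```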

.github/workflows (application Docker image build workflow; file name not shown in this view)

@@ -8,7 +8,12 @@ on:
 jobs:
   deploy:
+    if: github.repository == 'arc53/DocsGPT'
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
     steps:
       - uses: actions/checkout@v3
@@ -23,17 +28,17 @@ jobs:
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
       - name: Login to ghcr.io
         uses: docker/login-action@v2
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
-          password: ${{ secrets.GHCR_TOKEN }}
+          password: ${{ secrets.GITHUB_TOKEN }}
       # Runs a single command using the runners shell
       - name: Build and push Docker images to docker.io and ghcr.io
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v4
         with:
           file: './application/Dockerfile'
           platforms: linux/amd64

48
.github/workflows/cife.yml vendored Normal file

@@ -0,0 +1,48 @@
name: Build and push DocsGPT-FE Docker image

on:
  workflow_dispatch:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Login to ghcr.io
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Runs a single command using the runners shell
      - name: Build and push Docker images to docker.io and ghcr.io
        uses: docker/build-push-action@v4
        with:
          file: './frontend/Dockerfile'
          platforms: linux/amd64
          context: ./frontend
          push: true
          tags: |
            ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:latest
            ghcr.io/${{ github.repository_owner }}/docsgpt-fe:latest

17
.github/workflows/lint.yml vendored Normal file

@@ -0,0 +1,17 @@
name: Python linting

on:
  push:
    branches:
      - '*'
  pull_request:
    types: [ opened, synchronize ]

jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Lint with Ruff
        uses: chartboost/ruff-action@v1

30
.github/workflows/pytest.yml vendored Normal file

@@ -0,0 +1,30 @@
name: Run python tests with pytest

on: [push, pull_request]

jobs:
  pytest_and_coverage:
    name: Run tests and count coverage
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest pytest-cov
          cd application
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest and generate coverage report
        run: |
          python -m pytest --cov=application --cov=scripts --cov=extensions --cov-report=xml
      - name: Upload coverage reports to Codecov
        if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
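
The CONTRIBUTING.md diff further below documents running the same suite locally with `python -m pytest` from the repository root. A programmatic equivalent, as an illustrative sketch assuming pytest and pytest-cov are installed, is:

```python
# Illustrative: invoke the test suite from Python with similar coverage flags.
import sys

import pytest

sys.exit(pytest.main(["--cov=application", "--cov=scripts", "--cov-report=term"]))
```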

41
.github/workflows/sync_fork.yaml vendored Normal file

@@ -0,0 +1,41 @@
name: Upstream Sync

permissions:
  contents: write

on:
  schedule:
    - cron: "0 0 * * *" # every day at midnight
  workflow_dispatch:

jobs:
  sync_latest_from_upstream:
    name: Sync latest commits from upstream repo
    runs-on: ubuntu-latest
    if: ${{ github.event.repository.fork }}
    steps:
      # Step 1: run a standard checkout action
      - name: Checkout target repo
        uses: actions/checkout@v3

      # Step 2: run the sync action
      - name: Sync upstream changes
        id: sync
        uses: aormsby/Fork-Sync-With-Upstream-action@v3.4
        with:
          # set your upstream repo and branch
          upstream_sync_repo: arc53/DocsGPT
          upstream_sync_branch: main
          target_sync_branch: main
          target_repo_token: ${{ secrets.GITHUB_TOKEN }} # automatically generated, no need to set
          # Set test_mode true to run tests instead of the true action!!
          test_mode: false

      - name: Sync check
        if: failure()
        run: |
          echo "::error::Due to insufficient permissions, synchronization failed (as expected). Please go to the repository homepage and manually perform [Sync fork]."
          exit 1

12
.gitignore vendored

@@ -5,7 +5,7 @@ __pycache__/
# C extensions
*.so
*.next
# Distribution / packaging
.Python
build/
@@ -162,3 +162,13 @@ frontend/*.sw?
application/vectors/
**/inputs
**/indexes
**/temp
**/yarn.lock
node_modules/
.vscode/settings.json
models/

2
.ruff.toml Normal file

@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120

CONTRIBUTING.md

@@ -6,33 +6,39 @@ Thank you for choosing this project to contribute to, we are all very grateful!
 📣 Discussions - where you can start a new topic or answer some questions
-🐞 Issues - Is how we track tasks, sometimes its bugs that need fixing, sometimes its new features
+🐞 Issues - This is how we track tasks, sometimes it is bugs that need fixing, and sometimes it is new features
-🛠️ Pull requests - Is how you can suggest changes to our repository, to work on existing issue or to add new features
+🛠️ Pull requests - This is how you can suggest changes to our repository, to work on existing issues or add new features
 📚 Wiki - where we have our documentation
 ## 🐞 Issues and Pull requests
-We value contributions to our issues in form of discussion or suggestion, we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2)
+We value contributions to our issues in the form of discussion or suggestion, we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2)
-If you want to contribute by writing code there are few things that you should know before doing it:
+If you want to contribute by writing code there are a few things that you should know before doing it:
 We have frontend (React, Vite) and Backend (python)
 ### If you are looking to contribute to Frontend (⚛React, Vite):
-Current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new on. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1)
-Please try to follow guidelines
+The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1)
+Please try to follow the guidelines.
 ### If you are looking to contribute to Backend (🐍Python):
-Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon)
-Currently we don't have any tests(which would be useful😉) but before submitting you PR make sure that after you ingested some test data its queryable
+* Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon)
+* All new code should be covered with unit tests ([pytest](https://github.com/pytest-dev/pytest)). Please find tests under [/tests](https://github.com/arc53/DocsGPT/tree/main/tests) folder.
+* Before submitting your PR make sure that after you ingested some test data it is queryable.
+### Testing
+To run unit tests, from the root of the repository execute:
+```
+python -m pytest
+```
 ### Workflow:
-Create a fork, make changes on your forked repository, submit changes in a form of pull request
+Create a fork, make changes on your forked repository, and submit changes in the form of a pull request.
-## Questions / collaboration
+## Questions/collaboration
 Please join our [Discord](https://discord.gg/n5BX8dh8rU) don't hesitate, we are very friendly and welcoming to new contributors.
-# Thank you so much for considering to contribute to DocsGPT!🙏
+# Thank you so much for considering contributing to DocsGPT!🙏

31
HACKTOBERFEST.md Normal file

@@ -0,0 +1,31 @@
🎉 Join the Hacktoberfest with DocsGPT and Earn a Free T-shirt! 🎉
Welcome, contributors! We're excited to announce that DocsGPT is participating in Hacktoberfest. Get involved by submitting a **meaningful** pull request, and earn a free shirt in return!
📜 Here's How to Contribute:
🛠️ Code: This is the golden ticket! Make meaningful contributions through PRs.
📚 Wiki: Improve our documentation, Create a guide or change existing documentation.
🖥️ Design: Improve the UI/UX, or design a new feature.
📝 Guidelines for Pull Requests:
Familiarize yourself with the current contributions and our [Roadmap](https://github.com/orgs/arc53/projects/2).
Deciding to contribute with code? Here are some insights based on the area of your interest:
Frontend (⚛React, Vite):
Most of the code is located in /frontend folder. You can also check out our React extension in /extensions/react-widget.
For design references, here's the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1).
Ensure you adhere to the established guidelines.
Backend (🐍Python):
Focus on /application or /scripts. However, avoid the files ingest_rst.py and ingest_rst_sphinx.py as they are soon to be deprecated.
Newly added code should come with relevant unit tests (pytest).
Refer to the /tests folder for test suites.
Check out the [Contributing Guidelines](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md).
Don't be shy! Hop into our [Discord](https://discord.gg/n5BX8dh8rU) server. We're a friendly bunch and eager to assist newcomers.
Big thanks for considering contributing to DocsGPT during Hacktoberfest! 🙏 Your effort can earn you a swanky new t-shirt. 🎁 Let's code together! 🚀

README.md

@@ -18,54 +18,139 @@ Say goodbye to time-consuming manual searches, and let <strong>DocsGPT</strong>
<a href="https://discord.gg/n5BX8dh8rU">![example2](https://img.shields.io/github/forks/arc53/docsgpt?style=social)</a>
<a href="https://discord.gg/n5BX8dh8rU">![example3](https://img.shields.io/github/license/arc53/docsgpt)</a>
<a href="https://discord.gg/n5BX8dh8rU">![example3](https://img.shields.io/discord/1070046503302877216)</a>
</div>
### Enterprise Solutions:
When deploying your DocsGPT to a live environment, we're eager to provide personalized assistance. Reach out to us via email [here](mailto:contact@arc53.com?subject=DocsGPT%20Enterprise&body=Hi%20we%20are%20%3CCompany%20name%3E%20and%20we%20want%20to%20build%20%3CSolution%3E%20with%20DocsGPT) to discuss your project further, and our team will connect with you shortly.
### [🎉 Join the Hacktoberfest with DocsGPT and Earn a Free T-shirt! 🎉](https://github.com/arc53/DocsGPT/blob/main/HACKTOBERFEST.md)
![video-example-of-docs-gpt](https://d3dg1063dc54p9.cloudfront.net/videos/demov3.gif)
## Roadmap
You can find our [Roadmap](https://github.com/orgs/arc53/projects/2) here. Please don't hesitate to contribute or create issues; it helps us make DocsGPT better!
## Our open source models optimised for DocsGPT:
| Name | Base Model | Requirements (or similar) |
|-------------------|------------|----------------------------------------------------------|
| [Docsgpt-7b-falcon](https://huggingface.co/Arc53/docsgpt-7b-falcon) | Falcon-7b | 1x A10G GPU |
| [Docsgpt-14b](https://huggingface.co/Arc53/docsgpt-14b) | llama-2-14b | 2x A10 GPUs |
| [Docsgpt-40b-falcon](https://huggingface.co/Arc53/docsgpt-40b-falcon) | falcon-40b | 8x A10G GPUs |
If you don't have enough resources to run it, you can use bitsandbytes to quantize.
## Features
![Group 9](https://user-images.githubusercontent.com/17906039/220427472-2644cff4-7666-46a5-819f-fc4a521f63c7.png)
## Useful links
[Live preview](https://docsgpt.arc53.com/)
[Join Our Discord](https://discord.gg/n5BX8dh8rU)
[Guides](https://docs.docsgpt.co.uk/)
[Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
[How to use any other documentation](https://docs.docsgpt.co.uk/Guides/How-to-train-on-other-documentation)
[How to host it locally (so all data will stay on-premises)](https://docs.docsgpt.co.uk/Guides/How-to-use-different-LLM)
## Project structure
- Application - Flask app (main application)
- Extensions - Chrome extension
- Scripts - Script that creates a similarity search index and store for other libraries.
- Frontend - Built with Vite and React
## QuickStart
Note: Make sure you have Docker installed
On macOS or Linux, just run:
`./setup.sh`
It will install all the dependencies and give you the option to download a local model or use OpenAI.
Otherwise, refer to this guide:
1. Download and open this repository with `git clone https://github.com/arc53/DocsGPT.git`
2. Create a `.env` file in your root directory and set the env variable `OPENAI_API_KEY` with your OpenAI API key and `VITE_API_STREAMING` to true or false, depending on whether you want streaming answers.
It should look like this inside:
```
OPENAI_API_KEY=Yourkey
VITE_API_STREAMING=true
SELF_HOSTED_MODEL=false
```
See optional environment variables in the `/.env-template` and `/application/.env_sample` files.
3. Run `./run-with-docker-compose.sh`
4. Navigate to http://localhost:5173/
To stop, just press Ctrl + C.
## Development environments
### Spin up mongo and redis
For development, only two containers from docker-compose.yaml are used (all services except Redis and Mongo are removed).
See file [docker-compose-dev.yaml](./docker-compose-dev.yaml).
Run
```
docker compose -f docker-compose-dev.yaml build
docker compose -f docker-compose-dev.yaml up -d
```
### Run the backend
Make sure you have Python 3.10 or 3.11 installed.
1. Export required environment variables
```commandline
export CELERY_BROKER_URL=redis://localhost:6379/0
export CELERY_RESULT_BACKEND=redis://localhost:6379/1
export MONGO_URI=mongodb://localhost:27017/docsgpt
export FLASK_APP=application/app.py
export FLASK_DEBUG=true
```
2. Prepare .env file
Copy `.env_sample` and create `.env` with your OpenAI API token
3. (optional) Create a Python virtual environment
```commandline
python -m venv venv
. venv/bin/activate
```
4. Change to `application/` subdir and install dependencies for the backend
```commandline
pip install -r application/requirements.txt
```
5. Run the app `flask run --host=0.0.0.0 --port=7091`
6. Start worker with `celery -A application.app.celery worker -l INFO`
### Start frontend
Make sure you have Node version 16 or higher.
1. Navigate to `/frontend` folder
2. Install dependencies
`npm install`
3. Run the app
`npm run dev`
[How to install the Chrome extension](https://github.com/arc53/docsgpt/wiki#launch-chrome-extension)
Built with [🦜️🔗 LangChain](https://github.com/hwchase17/langchain)


@@ -1 +1,12 @@
OPENAI_API_KEY=your_api_key
API_KEY=your_api_key
EMBEDDINGS_KEY=your_api_key
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/1
MONGO_URI=mongodb://localhost:27017/docsgpt
API_URL=http://localhost:7091
# For OpenAI on Azure
OPENAI_API_BASE=
OPENAI_API_VERSION=
AZURE_DEPLOYMENT_NAME=
AZURE_EMBEDDINGS_DEPLOYMENT_NAME=


@@ -4,17 +4,20 @@ FROM python:3.10-slim-bullseye as builder
RUN apt-get update && apt-get install -y gcc curl
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --upgrade pip && pip install tiktoken==0.3.3
COPY requirements.txt .
RUN pip install -r requirements.txt
FROM python:3.10-slim-bullseye
CMD ["gunicorn", "-w", "6", "--bind", "0.0.0.0:5001", "wsgi:app"]
# Copy pre-built packages and binaries from builder stage
COPY --from=builder /usr/local/ /usr/local/
WORKDIR /app
COPY . /app/application
ENV FLASK_APP=app.py
ENV FLASK_DEBUG=true
EXPOSE 7091
CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"]

application/__init__.py Normal file


@@ -0,0 +1,337 @@
import asyncio
import os
from flask import Blueprint, request, Response
import json
import datetime
import logging
import traceback
from pymongo import MongoClient
from bson.objectid import ObjectId
from transformers import GPT2TokenizerFast
from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator
from application.llm.llm_creator import LLMCreator
from application.error import bad_request
logger = logging.getLogger(__name__)
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
answer = Blueprint('answer', __name__)
if settings.LLM_NAME == "gpt4":
gpt_model = 'gpt-4'
else:
gpt_model = 'gpt-3.5-turbo'
# load the prompts
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
with open(os.path.join(current_dir, "prompts", "combine_prompt.txt"), "r") as f:
template = f.read()
with open(os.path.join(current_dir, "prompts", "combine_prompt_hist.txt"), "r") as f:
template_hist = f.read()
with open(os.path.join(current_dir, "prompts", "question_prompt.txt"), "r") as f:
template_quest = f.read()
with open(os.path.join(current_dir, "prompts", "chat_combine_prompt.txt"), "r") as f:
chat_combine_template = f.read()
with open(os.path.join(current_dir, "prompts", "chat_reduce_prompt.txt"), "r") as f:
chat_reduce_template = f.read()
api_key_set = settings.API_KEY is not None
embeddings_key_set = settings.EMBEDDINGS_KEY is not None
async def async_generate(chain, question, chat_history):
result = await chain.arun({"question": question, "chat_history": chat_history})
return result
def count_tokens(string):
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
return len(tokenizer(string)['input_ids'])
def run_async_chain(chain, question, chat_history):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
result = {}
try:
answer = loop.run_until_complete(async_generate(chain, question, chat_history))
finally:
loop.close()
result["answer"] = answer
return result
def get_vectorstore(data):
if "active_docs" in data:
if data["active_docs"].split("/")[0] == "local":
if data["active_docs"].split("/")[1] == "default":
vectorstore = ""
else:
vectorstore = "indexes/" + data["active_docs"]
else:
vectorstore = "vectors/" + data["active_docs"]
if data["active_docs"] == "default":
vectorstore = ""
else:
vectorstore = ""
vectorstore = os.path.join("application", vectorstore)
return vectorstore
# def get_docsearch(vectorstore, embeddings_key):
# if settings.EMBEDDINGS_NAME == "openai_text-embedding-ada-002":
# if is_azure_configured():
# os.environ["OPENAI_API_TYPE"] = "azure"
# openai_embeddings = OpenAIEmbeddings(model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME)
# else:
# openai_embeddings = OpenAIEmbeddings(openai_api_key=embeddings_key)
# docsearch = FAISS.load_local(vectorstore, openai_embeddings)
# elif settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
# docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
# elif settings.EMBEDDINGS_NAME == "huggingface_hkunlp/instructor-large":
# docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
# elif settings.EMBEDDINGS_NAME == "cohere_medium":
# docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
# return docsearch
def is_azure_configured():
return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME
def complete_stream(question, docsearch, chat_history, api_key, conversation_id):
llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)
docs = docsearch.search(question, k=2)
if settings.LLM_NAME == "llama.cpp":
docs = [docs[0]]
# join all page_content together with a newline
docs_together = "\n".join([doc.page_content for doc in docs])
p_chat_combine = chat_combine_template.replace("{summaries}", docs_together)
messages_combine = [{"role": "system", "content": p_chat_combine}]
source_log_docs = []
for doc in docs:
if doc.metadata:
data = json.dumps({"type": "source", "doc": doc.page_content, "metadata": doc.metadata})
source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content})
else:
data = json.dumps({"type": "source", "doc": doc.page_content})
source_log_docs.append({"title": doc.page_content, "text": doc.page_content})
yield f"data:{data}\n\n"
if len(chat_history) > 1:
tokens_current_history = 0
# count tokens in history
chat_history.reverse()
for i in chat_history:
if "prompt" in i and "response" in i:
tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"])
if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
tokens_current_history += tokens_batch
messages_combine.append({"role": "user", "content": i["prompt"]})
messages_combine.append({"role": "system", "content": i["response"]})
messages_combine.append({"role": "user", "content": question})
response_full = ""
completion = llm.gen_stream(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
messages=messages_combine)
for line in completion:
data = json.dumps({"answer": str(line)})
response_full += str(line)
yield f"data: {data}\n\n"
# save conversation to database
if conversation_id is not None:
conversations_collection.update_one(
{"_id": ObjectId(conversation_id)},
{"$push": {"queries": {"prompt": question, "response": response_full, "sources": source_log_docs}}},
)
else:
# create new conversation
# generate summary
messages_summary = [{"role": "assistant", "content": "Summarise following conversation in no more than 3 "
"words, respond ONLY with the summary, use the same "
"language as the system \n\nUser: " + question + "\n\n" +
"AI: " +
response_full},
{"role": "user", "content": "Summarise following conversation in no more than 3 words, "
"respond ONLY with the summary, use the same language as the "
"system"}]
completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
messages=messages_summary, max_tokens=30)
conversation_id = conversations_collection.insert_one(
{"user": "local",
"date": datetime.datetime.utcnow(),
"name": completion,
"queries": [{"prompt": question, "response": response_full, "sources": source_log_docs}]}
).inserted_id
    # send data.type = "end" as json to indicate that the stream has ended
data = json.dumps({"type": "id", "id": str(conversation_id)})
yield f"data: {data}\n\n"
data = json.dumps({"type": "end"})
yield f"data: {data}\n\n"
@answer.route("/stream", methods=["POST"])
def stream():
data = request.get_json()
    # get the question from the request body
question = data["question"]
history = data["history"]
# history to json object from string
history = json.loads(history)
conversation_id = data["conversation_id"]
# check if active_docs is set
if not api_key_set:
api_key = data["api_key"]
else:
api_key = settings.API_KEY
if not embeddings_key_set:
embeddings_key = data["embeddings_key"]
else:
embeddings_key = settings.EMBEDDINGS_KEY
if "active_docs" in data:
vectorstore = get_vectorstore({"active_docs": data["active_docs"]})
else:
vectorstore = ""
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)
return Response(
complete_stream(question, docsearch,
chat_history=history, api_key=api_key,
conversation_id=conversation_id), mimetype="text/event-stream"
)
@answer.route("/api/answer", methods=["POST"])
def api_answer():
data = request.get_json()
question = data["question"]
history = data["history"]
if "conversation_id" not in data:
conversation_id = None
else:
conversation_id = data["conversation_id"]
print("-" * 5)
if not api_key_set:
api_key = data["api_key"]
else:
api_key = settings.API_KEY
if not embeddings_key_set:
embeddings_key = data["embeddings_key"]
else:
embeddings_key = settings.EMBEDDINGS_KEY
# use try and except to check for exception
try:
# check if the vectorstore is set
vectorstore = get_vectorstore(data)
# loading the index and the store and the prompt template
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)
llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)
docs = docsearch.search(question, k=2)
# join all page_content together with a newline
docs_together = "\n".join([doc.page_content for doc in docs])
p_chat_combine = chat_combine_template.replace("{summaries}", docs_together)
messages_combine = [{"role": "system", "content": p_chat_combine}]
source_log_docs = []
for doc in docs:
if doc.metadata:
source_log_docs.append({"title": doc.metadata['title'].split('/')[-1], "text": doc.page_content})
else:
source_log_docs.append({"title": doc.page_content, "text": doc.page_content})
# join all page_content together with a newline
if len(history) > 1:
tokens_current_history = 0
# count tokens in history
history.reverse()
for i in history:
if "prompt" in i and "response" in i:
tokens_batch = count_tokens(i["prompt"]) + count_tokens(i["response"])
if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
tokens_current_history += tokens_batch
messages_combine.append({"role": "user", "content": i["prompt"]})
messages_combine.append({"role": "system", "content": i["response"]})
messages_combine.append({"role": "user", "content": question})
completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
messages=messages_combine)
result = {"answer": completion, "sources": source_log_docs}
logger.debug(result)
# generate conversationId
if conversation_id is not None:
conversations_collection.update_one(
{"_id": ObjectId(conversation_id)},
{"$push": {"queries": {"prompt": question,
"response": result["answer"], "sources": result['sources']}}},
)
else:
# create new conversation
# generate summary
messages_summary = [
{"role": "assistant", "content": "Summarise following conversation in no more than 3 words, "
"respond ONLY with the summary, use the same language as the system \n\n"
"User: " + question + "\n\n" + "AI: " + result["answer"]},
{"role": "user", "content": "Summarise following conversation in no more than 3 words, "
"respond ONLY with the summary, use the same language as the system"}
]
completion = llm.gen(
model=gpt_model,
engine=settings.AZURE_DEPLOYMENT_NAME,
messages=messages_summary,
max_tokens=30
)
conversation_id = conversations_collection.insert_one(
{"user": "local",
"date": datetime.datetime.utcnow(),
"name": completion,
"queries": [{"prompt": question, "response": result["answer"], "sources": source_log_docs}]}
).inserted_id
result["conversation_id"] = str(conversation_id)
# mock result
# result = {
# "answer": "The answer is 42",
# "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"]
# }
return result
except Exception as e:
# print whole traceback
traceback.print_exc()
print(str(e))
return bad_request(500, str(e))
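As a rough illustration of the SSE wire format produced by `complete_stream` above, here is a minimal client sketch; the host/port and payload values are assumptions, and the keys mirror what `stream()` reads from the request body:

```python
import json
import requests

payload = {
    "question": "What is DocsGPT?",
    "history": "[]",                   # the route json.loads() this string
    "conversation_id": None,
    "active_docs": "local/default",    # optional; selects the vectorstore
    "api_key": "your_api_key",         # only needed if API_KEY is unset server-side
    "embeddings_key": "your_api_key",  # likewise for EMBEDDINGS_KEY
}

with requests.post("http://localhost:7091/stream", json=payload, stream=True) as r:
    for raw in r.iter_lines():
        if not raw:
            continue
        # events arrive as "data: {...}" (source events omit the space)
        line = raw.decode("utf-8").removeprefix("data: ").removeprefix("data:")
        event = json.loads(line)
        if event.get("type") == "end":
            break
        print(event)  # source docs, answer fragments, then the conversation id
```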


@@ -0,0 +1,69 @@
import os
import datetime
from flask import Blueprint, request, send_from_directory
from pymongo import MongoClient
from werkzeug.utils import secure_filename
from application.core.settings import settings
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
internal = Blueprint('internal', __name__)
@internal.route("/api/download", methods=["get"])
def download_file():
user = secure_filename(request.args.get("user"))
job_name = secure_filename(request.args.get("name"))
filename = secure_filename(request.args.get("file"))
save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
return send_from_directory(save_dir, filename, as_attachment=True)
@internal.route("/api/upload_index", methods=["POST"])
def upload_index_files():
"""Upload two files(index.faiss, index.pkl) to the user's folder."""
if "user" not in request.form:
return {"status": "no user"}
user = secure_filename(request.form["user"])
if "name" not in request.form:
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
save_dir = os.path.join(current_dir, "indexes", user, job_name)
if settings.VECTOR_STORE == "faiss":
if "file_faiss" not in request.files:
print("No file part")
return {"status": "no file"}
file_faiss = request.files["file_faiss"]
if file_faiss.filename == "":
return {"status": "no file name"}
if "file_pkl" not in request.files:
print("No file part")
return {"status": "no file"}
file_pkl = request.files["file_pkl"]
if file_pkl.filename == "":
return {"status": "no file name"}
# saves index files
if not os.path.exists(save_dir):
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))
file_pkl.save(os.path.join(save_dir, "index.pkl"))
# create entry in vectors_collection
vectors_collection.insert_one(
{
"user": user,
"name": job_name,
"language": job_name,
"location": save_dir,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": settings.EMBEDDINGS_NAME,
"type": "local",
}
)
return {"status": "ok"}


@@ -0,0 +1,226 @@
import os
from flask import Blueprint, request, jsonify
import requests
import json
from pymongo import MongoClient
from bson.objectid import ObjectId
from werkzeug.utils import secure_filename
import http.client
from application.api.user.tasks import ingest
from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
user = Blueprint('user', __name__)
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@user.route("/api/delete_conversation", methods=["POST"])
def delete_conversation():
# deletes a conversation from the database
conversation_id = request.args.get("id")
# write to mongodb
conversations_collection.delete_one(
{
"_id": ObjectId(conversation_id),
}
)
return {"status": "ok"}
@user.route("/api/get_conversations", methods=["get"])
def get_conversations():
# provides a list of conversations
conversations = conversations_collection.find().sort("date", -1)
list_conversations = []
for conversation in conversations:
list_conversations.append({"id": str(conversation["_id"]), "name": conversation["name"]})
#list_conversations = [{"id": "default", "name": "default"}, {"id": "jeff", "name": "jeff"}]
return jsonify(list_conversations)
@user.route("/api/get_single_conversation", methods=["get"])
def get_single_conversation():
# provides data for a conversation
conversation_id = request.args.get("id")
conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)})
return jsonify(conversation['queries'])
@user.route("/api/feedback", methods=["POST"])
def api_feedback():
data = request.get_json()
question = data["question"]
answer = data["answer"]
feedback = data["feedback"]
print("-" * 5)
print("Question: " + question)
print("Answer: " + answer)
print("Feedback: " + feedback)
print("-" * 5)
response = requests.post(
url="https://86x89umx77.execute-api.eu-west-2.amazonaws.com/docsgpt-feedback",
headers={
"Content-Type": "application/json; charset=utf-8",
},
data=json.dumps({"answer": answer, "question": question, "feedback": feedback}),
)
return {"status": http.client.responses.get(response.status_code, "ok")}
@user.route("/api/delete_old", methods=["get"])
def delete_old():
"""Delete old indexes."""
import shutil
path = request.args.get("path")
dirs = path.split("/")
dirs_clean = []
for i in range(1, len(dirs)):
dirs_clean.append(secure_filename(dirs[i]))
    # check that the path starts with indexes or vectors
if dirs[0] not in ["indexes", "vectors"]:
return {"status": "error"}
path_clean = "/".join(dirs)
vectors_collection.delete_one({"location": path})
if settings.VECTOR_STORE == "faiss":
try:
shutil.rmtree(os.path.join(current_dir, path_clean))
except FileNotFoundError:
pass
else:
        vectorstore = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean)
        )
        vectorstore.delete_index()
return {"status": "ok"}
@user.route("/api/upload", methods=["POST"])
def upload_file():
"""Upload a file to get vectorized and indexed."""
if "user" not in request.form:
return {"status": "no user"}
user = secure_filename(request.form["user"])
if "name" not in request.form:
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
# check if the post request has the file part
if "file" not in request.files:
print("No file part")
return {"status": "no file"}
file = request.files["file"]
if file.filename == "":
return {"status": "no file name"}
if file:
filename = secure_filename(file.filename)
# save dir
save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
# create dir if not exists
if not os.path.exists(save_dir):
os.makedirs(save_dir)
file.save(os.path.join(save_dir, filename))
task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt"], job_name, filename, user)
# task id
task_id = task.id
return {"status": "ok", "task_id": task_id}
else:
return {"status": "error"}
@user.route("/api/task_status", methods=["GET"])
def task_status():
"""Get celery job status."""
task_id = request.args.get("task_id")
from application.celery import celery
task = celery.AsyncResult(task_id)
task_meta = task.info
return {"status": task.status, "result": task_meta}
@user.route("/api/combine", methods=["GET"])
def combined_json():
user = "local"
"""Provide json file with combined available indexes."""
# get json from https://d3dg1063dc54p9.cloudfront.net/combined.json
data = [
{
"name": "default",
"language": "default",
"version": "",
"description": "default",
"fullName": "default",
"date": "default",
"docLink": "default",
"model": settings.EMBEDDINGS_NAME,
"location": "local",
}
]
# structure: name, language, version, description, fullName, date, docLink
# append data from vectors_collection
for index in vectors_collection.find({"user": user}):
data.append(
{
"name": index["name"],
"language": index["language"],
"version": "",
"description": index["name"],
"fullName": index["name"],
"date": index["date"],
"docLink": index["location"],
"model": settings.EMBEDDINGS_NAME,
"location": "local",
}
)
if settings.VECTOR_STORE == "faiss":
data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
for index in data_remote:
index["location"] = "remote"
data.append(index)
return jsonify(data)
@user.route("/api/docs_check", methods=["POST"])
def check_docs():
# check if docs exist in a vectorstore folder
data = request.get_json()
# split docs on / and take first part
if data["docs"].split("/")[0] == "local":
return {"status": "exists"}
vectorstore = "vectors/" + data["docs"]
base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/"
if os.path.exists(vectorstore) or data["docs"] == "default":
return {"status": "exists"}
else:
r = requests.get(base_path + vectorstore + "index.faiss")
if r.status_code != 200:
return {"status": "null"}
else:
if not os.path.exists(vectorstore):
os.makedirs(vectorstore)
with open(vectorstore + "index.faiss", "wb") as f:
f.write(r.content)
# download the store
r = requests.get(base_path + vectorstore + "index.pkl")
with open(vectorstore + "index.pkl", "wb") as f:
f.write(r.content)
return {"status": "loaded"}


@@ -0,0 +1,7 @@
from application.worker import ingest_worker
from application.celery import celery
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)
return resp


@@ -1,42 +1,19 @@
import platform
import dotenv
from application.celery import celery
from flask import Flask, request, redirect
from application.core.settings import settings
from application.api.user.routes import user
from application.api.answer.routes import answer
from application.api.internal.routes import internal
if os.getenv("EMBEDDINGS_NAME") is not None:
embeddings_choice = os.getenv("EMBEDDINGS_NAME")
else:
embeddings_choice = "openai_text-embedding-ada-002"
if llm_choice == "manifest":
from manifest import Manifest
from langchain.llms.manifest import ManifestWrapper
manifest = Manifest(
client_name="huggingface",
client_connection="http://127.0.0.1:5000"
)
# Redirect PosixPath to WindowsPath on Windows
if platform.system() == "Windows":
import pathlib
@@ -47,156 +24,45 @@ if platform.system() == "Windows":
# loading the .env file
dotenv.load_dotenv()
with open("combine_prompt.txt", "r") as f:
template = f.read()
with open("combine_prompt_hist.txt", "r") as f:
template_hist = f.read()
with open("question_prompt.txt", "r") as f:
template_quest = f.read()
if os.getenv("API_KEY") is not None:
api_key_set = True
else:
api_key_set = False
if os.getenv("EMBEDDINGS_KEY") is not None:
embeddings_key_set = True
else:
embeddings_key_set = False
app = Flask(__name__)
app.register_blueprint(user)
app.register_blueprint(answer)
app.register_blueprint(internal)
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER = "inputs"
app.config["CELERY_BROKER_URL"] = settings.CELERY_BROKER_URL
app.config["CELERY_RESULT_BACKEND"] = settings.CELERY_RESULT_BACKEND
app.config["MONGO_URI"] = settings.MONGO_URI
celery.config_from_object("application.celeryconfig")
@app.route("/")
def home():
"""
The frontend source code lives in the /frontend directory of the repository.
"""
if request.remote_addr in ('0.0.0.0', '127.0.0.1', 'localhost', '172.18.0.1'):
# If users locally try to access DocsGPT running in Docker,
# they will be redirected to the Frontend application.
return redirect('http://localhost:5173')
else:
api_key = os.getenv("API_KEY")
if not embeddings_key_set:
embeddings_key = data["embeddings_key"]
else:
embeddings_key = os.getenv("EMBEDDINGS_KEY")
# use try and except to check for exception
try:
# check if the vectorstore is set
if "active_docs" in data:
vectorstore = "vectors/" + data["active_docs"]
if data['active_docs'] == "default":
vectorstore = ""
else:
vectorstore = ""
#vectorstore = "outputs/inputs/"
# loading the index and the store and the prompt template
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
if embeddings_choice == "openai_text-embedding-ada-002":
docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2":
docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
elif embeddings_choice == "huggingface_hkunlp/instructor-large":
docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
elif embeddings_choice == "cohere_medium":
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
# create a prompt template
if history:
history = json.loads(history)
template_temp = template_hist.replace("{historyquestion}", history[0]).replace("{historyanswer}",
history[1])
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template_temp,
template_format="jinja2")
else:
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
template_format="jinja2")
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if llm_choice == "openai":
llm = OpenAIChat(openai_api_key=api_key, temperature=0)
#llm = OpenAI(openai_api_key=api_key, temperature=0)
elif llm_choice == "manifest":
llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
elif llm_choice == "huggingface":
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
elif llm_choice == "cohere":
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
combine_prompt=c_prompt, question_prompt=q_prompt)
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=10)
# fetch the answer
result = chain({"query": question})
# some formatting for the frontend
result['answer'] = result['result']
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
pass
# mock result
# result = {
# "answer": "The answer is 42",
# "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"]
# }
return result
except Exception as e:
# print whole traceback
traceback.print_exc()
print(str(e))
return bad_request(500, str(e))
# Handle other cases or render the default page
return 'Welcome to DocsGPT Backend!'
@app.route("/api/docs_check", methods=["POST"])
def check_docs():
# check if docs exist in a vectorstore folder
data = request.get_json()
vectorstore = "vectors/" + data["docs"]
base_path = 'https://raw.githubusercontent.com/arc53/DocsHUB/main/'
if os.path.exists(vectorstore) or data["docs"] == "default":
return {"status": 'exists'}
else:
r = requests.get(base_path + vectorstore + "index.faiss")
if r.status_code != 200:
return {"status": 'null'}
else:
if not os.path.exists(vectorstore):
os.makedirs(vectorstore)
with open(vectorstore + "index.faiss", "wb") as f:
f.write(r.content)
# download the store
r = requests.get(base_path + vectorstore + "index.pkl")
with open(vectorstore + "index.pkl", "wb") as f:
f.write(r.content)
return {"status": 'loaded'}
# handling CORS
@app.after_request
def after_request(response):
response.headers.add("Access-Control-Allow-Origin", "*")
response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization")
response.headers.add("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS")
# response.headers.add("Access-Control-Allow-Credentials", "true")
return response
if __name__ == "__main__":
app.run(debug=True, port=7091)

application/celery.py Normal file

@@ -0,0 +1,9 @@
from celery import Celery
from application.core.settings import settings
def make_celery(app_name=__name__):
celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND)
celery.conf.update(settings)
return celery
celery = make_celery()


@@ -0,0 +1,8 @@
import os
broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")
task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']


@@ -0,0 +1,37 @@
from pathlib import Path
import os
from pydantic import BaseSettings
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
class Settings(BaseSettings):
LLM_NAME: str = "openai"
EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002"
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf")
TOKENS_MAX_HISTORY: int = 150
UPLOAD_FOLDER: str = "inputs"
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch"
API_URL: str = "http://localhost:7091" # backend url for celery worker
API_KEY: str = None # LLM api key
    EMBEDDINGS_KEY: str = None  # api key for embeddings (if using OpenAI, just copy API_KEY)
OPENAI_API_BASE: str = None # azure openai api base url
OPENAI_API_VERSION: str = None # azure openai api version
AZURE_DEPLOYMENT_NAME: str = None # azure deployment name for answering
AZURE_EMBEDDINGS_DEPLOYMENT_NAME: str = None # azure deployment name for embeddings
# elasticsearch
ELASTIC_CLOUD_ID: str = None # cloud id for elasticsearch
ELASTIC_USERNAME: str = None # username for elasticsearch
ELASTIC_PASSWORD: str = None # password for elasticsearch
ELASTIC_URL: str = None # url for elasticsearch
ELASTIC_INDEX: str = "docsgpt" # index name for elasticsearch
path = Path(__file__).parent.parent.absolute()
settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")
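Because `Settings` is a pydantic `BaseSettings`, every field above can be overridden through the environment (or the `.env` file passed to `_env_file`), with environment variables taking precedence over the file. A small sketch, with illustrative values:

```python
import os

# set overrides before the settings module is imported
os.environ["VECTOR_STORE"] = "elasticsearch"
os.environ["ELASTIC_URL"] = "http://localhost:9200"

from application.core.settings import settings

print(settings.VECTOR_STORE)  # elasticsearch
```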


@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES
def response_error(code_status, message=None):
payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
if message:
payload['message'] = message
response = jsonify(payload)
response.status_code = code_status
return response
def bad_request(status_code=400, message=''):
return response_error(code_status=status_code, message=message)
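A hypothetical route showing how these helpers are used (the blueprint and endpoint are made up for illustration):

```python
from flask import Blueprint
from application.error import bad_request

demo = Blueprint("demo", __name__)

@demo.route("/api/always_fails")
def always_fails():
    # responds with HTTP 500 and a JSON body:
    # {"error": "Internal Server Error", "message": "something broke"}
    return bad_request(500, "something broke")
```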

Binary file not shown.

Binary file not shown.


application/llm/base.py Normal file

@@ -0,0 +1,14 @@
from abc import ABC, abstractmethod
class BaseLLM(ABC):
def __init__(self):
pass
@abstractmethod
def gen(self, *args, **kwargs):
pass
@abstractmethod
def gen_stream(self, *args, **kwargs):
pass
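Every concrete LLM below implements this pair of methods. A minimal sketch of the contract (EchoLLM is a made-up stand-in, not part of the repository):

```python
from application.llm.base import BaseLLM

class EchoLLM(BaseLLM):
    """Toy LLM that just echoes the last user message."""

    def gen(self, model, engine, messages, stream=False, **kwargs):
        return messages[-1]["content"]

    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
        for token in messages[-1]["content"].split():
            yield token
```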


@@ -0,0 +1,31 @@
from application.llm.base import BaseLLM
class HuggingFaceLLM(BaseLLM):
def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B'):
global hf
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(llm_name)
pipe = pipeline(
"text-generation", model=model,
tokenizer=tokenizer, max_new_tokens=2000,
device_map="auto", eos_token_id=tokenizer.eos_token_id
)
hf = HuggingFacePipeline(pipeline=pipe)
def gen(self, model, engine, messages, stream=False, **kwargs):
context = messages[0]['content']
user_question = messages[-1]['content']
prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
        result = hf(prompt)
        return result
def gen_stream(self, model, engine, messages, stream=True, **kwargs):
raise NotImplementedError("HuggingFaceLLM Streaming is not implemented yet.")


@@ -0,0 +1,39 @@
from application.llm.base import BaseLLM
from application.core.settings import settings
class LlamaCpp(BaseLLM):
def __init__(self, api_key, llm_name=settings.MODEL_PATH, **kwargs):
global llama
try:
from llama_cpp import Llama
except ImportError:
raise ImportError("Please install llama_cpp using pip install llama-cpp-python")
llama = Llama(model_path=llm_name, n_ctx=2048)
def gen(self, model, engine, messages, stream=False, **kwargs):
context = messages[0]['content']
user_question = messages[-1]['content']
prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
result = llama(prompt, max_tokens=150, echo=False)
# import sys
# print(result['choices'][0]['text'].split('### Answer \n')[-1], file=sys.stderr)
return result['choices'][0]['text'].split('### Answer \n')[-1]
def gen_stream(self, model, engine, messages, stream=True, **kwargs):
context = messages[0]['content']
user_question = messages[-1]['content']
prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
result = llama(prompt, max_tokens=150, echo=False, stream=stream)
# import sys
# print(list(result), file=sys.stderr)
for item in result:
for choice in item['choices']:
yield choice['text']


@@ -0,0 +1,22 @@
from application.llm.openai import OpenAILLM, AzureOpenAILLM
from application.llm.sagemaker import SagemakerAPILLM
from application.llm.huggingface import HuggingFaceLLM
from application.llm.llama_cpp import LlamaCpp
class LLMCreator:
llms = {
'openai': OpenAILLM,
'azure_openai': AzureOpenAILLM,
'sagemaker': SagemakerAPILLM,
'huggingface': HuggingFaceLLM,
'llama.cpp': LlamaCpp
}
@classmethod
def create_llm(cls, type, *args, **kwargs):
llm_class = cls.llms.get(type.lower())
if not llm_class:
raise ValueError(f"No LLM class found for type {type}")
return llm_class(*args, **kwargs)
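A sketch of the factory in use; in the routes above the first argument comes from `settings.LLM_NAME`, and the literals here are only for illustration:

```python
from application.llm.llm_creator import LLMCreator

llm = LLMCreator.create_llm("openai", api_key="your_api_key")
answer = llm.gen(
    model="gpt-3.5-turbo",
    engine=None,  # only meaningful for the Azure variant
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hi."},
    ],
)
print(answer)
```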

application/llm/openai.py Normal file

@@ -0,0 +1,57 @@
from application.llm.base import BaseLLM
from application.core.settings import settings
class OpenAILLM(BaseLLM):
def __init__(self, api_key):
global openai
import openai
openai.api_key = api_key
self.api_key = api_key # Save the API key to be used later
def _get_openai(self):
# Import openai when needed
import openai
# Set the API key every time you import openai
openai.api_key = self.api_key
return openai
def gen(self, model, engine, messages, stream=False, **kwargs):
response = openai.ChatCompletion.create(
model=model,
engine=engine,
messages=messages,
stream=stream,
**kwargs
)
return response["choices"][0]["message"]["content"]
def gen_stream(self, model, engine, messages, stream=True, **kwargs):
response = openai.ChatCompletion.create(
model=model,
engine=engine,
messages=messages,
stream=stream,
**kwargs
)
for line in response:
if "content" in line["choices"][0]["delta"]:
yield line["choices"][0]["delta"]["content"]
class AzureOpenAILLM(OpenAILLM):
def __init__(self, openai_api_key, openai_api_base, openai_api_version, deployment_name):
super().__init__(openai_api_key)
        self.api_base = settings.OPENAI_API_BASE
        self.api_version = settings.OPENAI_API_VERSION
        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME
def _get_openai(self):
openai = super()._get_openai()
openai.api_base = self.api_base
openai.api_version = self.api_version
openai.api_type = "azure"
return openai


@@ -0,0 +1,27 @@
from application.llm.base import BaseLLM
from application.core.settings import settings
import requests
import json
class SagemakerAPILLM(BaseLLM):
def __init__(self, *args, **kwargs):
self.url = settings.SAGEMAKER_API_URL
def gen(self, model, engine, messages, stream=False, **kwargs):
context = messages[0]['content']
user_question = messages[-1]['content']
prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
response = requests.post(
url=self.url,
headers={
"Content-Type": "application/json; charset=utf-8",
},
data=json.dumps({"input": prompt})
)
return response.json()['answer']
def gen_stream(self, model, engine, messages, stream=True, **kwargs):
raise NotImplementedError("Sagemaker does not support streaming")


@@ -0,0 +1 @@


@@ -0,0 +1 @@


@@ -0,0 +1,19 @@
"""Base reader class."""
from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from application.parser.schema.base import Document
class BaseReader:
"""Utilities for loading data from a directory."""
@abstractmethod
def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory."""
def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
"""Load data in LangChain document format."""
docs = self.load_data(**load_kwargs)
return [d.to_langchain_format() for d in docs]


@@ -0,0 +1,38 @@
"""Base parser and config class."""
from abc import abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Union
class BaseParser:
"""Base class for all parsers."""
def __init__(self, parser_config: Optional[Dict] = None):
"""Init params."""
self._parser_config = parser_config
def init_parser(self) -> None:
"""Init parser and store it."""
parser_config = self._init_parser()
self._parser_config = parser_config
@property
def parser_config_set(self) -> bool:
"""Check if parser config is set."""
return self._parser_config is not None
    @property
    def parser_config(self) -> Dict:
        """Get the parser config."""
if self._parser_config is None:
raise ValueError("Parser config not set.")
return self._parser_config
@abstractmethod
def _init_parser(self) -> Dict:
"""Initialize the parser with the config."""
@abstractmethod
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file."""


@@ -0,0 +1,163 @@
"""Simple reader that reads files of different formats from a directory."""
import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from application.parser.file.base import BaseReader
from application.parser.file.base_parser import BaseParser
from application.parser.file.docs_parser import DocxParser, PDFParser
from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),
".html": HTMLParser(),
".mdx": MarkdownParser(),
}
class SimpleDirectoryReader(BaseReader):
"""Simple directory reader.
    Can read files into separate documents, or concatenate
    files into one document text.
Args:
input_dir (str): Path to the directory.
input_files (List): List of file paths to read (Optional; overrides input_dir)
exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
errors (str): how encoding and decoding errors are to be handled,
see https://docs.python.org/3/library/functions.html#open
recursive (bool): Whether to recursively search in subdirectories.
False by default.
required_exts (Optional[List[str]]): List of required extensions.
Default is None.
file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file
extension to a BaseParser class that specifies how to convert that file
to text. See DEFAULT_FILE_EXTRACTOR.
num_files_limit (Optional[int]): Maximum number of files to read.
Default is None.
file_metadata (Optional[Callable[str, Dict]]): A function that takes
in a filename and returns a Dict of metadata for the Document.
Default is None.
"""
def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
) -> None:
"""Initialize with parameters."""
super().__init__()
if not input_dir and not input_files:
raise ValueError("Must provide either `input_dir` or `input_files`.")
self.errors = errors
self.recursive = recursive
self.exclude_hidden = exclude_hidden
self.required_exts = required_exts
self.num_files_limit = num_files_limit
if input_files:
self.input_files = []
for path in input_files:
print(path)
input_file = Path(path)
self.input_files.append(input_file)
elif input_dir:
self.input_dir = Path(input_dir)
self.input_files = self._add_files(self.input_dir)
self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR
self.file_metadata = file_metadata
def _add_files(self, input_dir: Path) -> List[Path]:
"""Add files."""
input_files = sorted(input_dir.iterdir())
new_input_files = []
dirs_to_explore = []
for input_file in input_files:
if input_file.is_dir():
if self.recursive:
dirs_to_explore.append(input_file)
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
new_input_files.append(input_file)
for dir_to_explore in dirs_to_explore:
sub_input_files = self._add_files(dir_to_explore)
new_input_files.extend(sub_input_files)
if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0: self.num_files_limit]
# print total number of files added
logging.debug(
f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
)
return new_input_files
def load_data(self, concatenate: bool = False) -> List[Document]:
"""Load data from the input directory.
Args:
concatenate (bool): whether to concatenate all files into one document.
If set to True, file metadata is ignored.
False by default.
Returns:
List[Document]: A list of documents.
"""
data: Union[str, List[str]] = ""
data_list: List[str] = []
metadata_list = []
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
if not parser.parser_config_set:
parser.init_parser()
data = parser.parse_file(input_file, errors=self.errors)
else:
# do standard read
with open(input_file, "r", errors=self.errors) as f:
data = f.read()
if isinstance(data, List):
data_list.extend(data)
else:
data_list.append(str(data))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if concatenate:
return [Document("\n".join(data_list))]
elif self.file_metadata is not None:
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
else:
return [Document(d) for d in data_list]
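A usage sketch for the reader (the module path and input directory are assumptions for illustration):

```python
from application.parser.file.bulk import SimpleDirectoryReader  # assumed module path

reader = SimpleDirectoryReader(
    input_dir="inputs",                    # assumed directory of source docs
    required_exts=[".md", ".rst", ".pdf"],
    recursive=True,
)
docs = reader.load_data()
print(f"loaded {len(docs)} documents")
```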


@@ -0,0 +1,59 @@
"""Docs parser.
Contains parsers for docx, pdf files.
"""
from pathlib import Path
from typing import Dict
from application.parser.file.base_parser import BaseParser
class PDFParser(BaseParser):
"""PDF parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
try:
import PyPDF2
except ImportError:
raise ValueError("PyPDF2 is required to read PDF files.")
text_list = []
with open(file, "rb") as fp:
# Create a PDF object
pdf = PyPDF2.PdfReader(fp)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
# Iterate over every page
for page in range(num_pages):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
text_list.append(page_text)
text = "\n".join(text_list)
return text
class DocxParser(BaseParser):
"""Docx parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
try:
import docx2txt
except ImportError:
raise ValueError("docx2txt is required to read Microsoft Word files.")
text = docx2txt.process(file)
return text


@@ -0,0 +1,43 @@
"""Epub parser.
Contains parsers for epub files.
"""
from pathlib import Path
from typing import Dict
from application.parser.file.base_parser import BaseParser
class EpubParser(BaseParser):
"""Epub Parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
try:
import ebooklib
from ebooklib import epub
except ImportError:
raise ValueError("`EbookLib` is required to read Epub files.")
try:
import html2text
except ImportError:
raise ValueError("`html2text` is required to parse Epub files.")
text_list = []
book = epub.read_epub(file, options={"ignore_ncx": True})
# Iterate through all chapters.
for item in book.get_items():
# Chapters are typically located in epub documents items.
if item.get_type() == ebooklib.ITEM_DOCUMENT:
text_list.append(
html2text.html2text(item.get_content().decode("utf-8"))
)
text = "\n".join(text_list)
return text


@@ -0,0 +1,83 @@
"""HTML parser.
Contains parser for html files.
"""
import re
from pathlib import Path
from typing import Dict, Union
from application.parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
"""Parse file.
Returns:
Union[str, List[str]]: a string or a List of strings.
"""
try:
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
        # Removing non-ascii characters from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
# Each Chunk can be thought of as an individual set of data, which can be sent to the model
# Where Each Title is grouped together with the data under it
        Chunks = [[]]
        final_chunks = []
        for i, isd_el in enumerate(isd):
            if i in title_indexes:
                Chunks.append([])
            Chunks[-1].append(isd_el['text'])
        # Removing all the chunks whose combined text length is < 25
        # TODO: This value can be a user defined variable
        for chunk in Chunks:
            # total length of all the strings in the chunk
            if len(" ".join(str(item) for item in chunk)) < 25:
                continue
            # appending each approved chunk to final_chunks as a single string
            final_chunks.append(" ".join(str(item) for item in chunk))
        return final_chunks


@@ -0,0 +1,145 @@
"""Markdown parser.
Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import tiktoken
from application.parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):
"""Markdown parser.
Extract text from markdown files.
Returns dictionary with keys as headers and values as the text between headers.
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._max_tokens = max_tokens
# self._remove_tables = remove_tables
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
# note: slices by characters as a rough approximation of the token limit
chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
for chunk in chunks:
tups.append((current_header, chunk))
else:
tups.append((current_header, current_text))
return tups
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
The keys are the headers and the values are the text under each header.
"""
markdown_tups: List[Tuple[Optional[str], str]] = []
lines = markdown_text.split("\n")
current_header = None
current_text = ""
for line in lines:
header_match = re.match(r"^#+\s", line)
if header_match:
if current_header is not None:
if current_text == "" or None:
continue
markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text)
if current_header is not None:
# headers found: strip '#' marks from keys and drop HTML tags from values
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
else:
markdown_tups = [
(key, re.sub("\n", "", value)) for key, value in markdown_tups
]
return markdown_tups
def remove_images(self, content: str) -> str:
"""Remove image embeds from markdown content."""
pattern = r"!\[\[(.*)\]\]"
content = re.sub(pattern, "", content)
return content
# def remove_tables(self, content: str) -> List[List[str]]:
# """Convert markdown tables to nested lists."""
# table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)"
# table_cells_pattern = r"([^\|\r\n]*)\|"
#
# table_rows = re.findall(table_rows_pattern, content, re.MULTILINE)
# table_lists = []
# for row in table_rows:
# cells = re.findall(table_cells_pattern, row[2])
# cells = [cell.strip() for cell in cells if cell.strip()]
# table_lists.append(cells)
# return str(table_lists)
def remove_hyperlinks(self, content: str) -> str:
"""Replace markdown hyperlinks with their anchor text."""
pattern = r"\[(.*?)\]\((.*?)\)"
content = re.sub(pattern, r"\1", content)
return content
def _init_parser(self) -> Dict:
"""Initialize the parser with the config."""
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
content = f.read()
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
# if self._remove_tables:
# content = self.remove_tables(content)
markdown_tups = self.markdown_to_tups(content)
return markdown_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
results = []
# TODO: don't include headers right now
for header, value in tups:
if header is None:
results.append(value)
else:
results.append(f"\n\n{header}\n{value}")
return results
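# A minimal usage sketch (hypothetical file path, not part of the original module):
#
#     parser = MarkdownParser(remove_hyperlinks=True, max_tokens=2048)
#     sections = parser.parse_file(Path("docs/example.md"))
#     # sections is a list of strings, roughly one per header, e.g. "\n\nMy Header\nbody text"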

View File

@@ -0,0 +1,173 @@
"""reStructuredText parser.
Contains parser for rst files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from application.parser.file.base_parser import BaseParser
class RstParser(BaseParser):
"""reStructuredText parser.
Extract text from .rst files.
Returns dictionary with keys as headers and values as the text between headers.
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._remove_table_excess = remove_table_excess
self._remove_interpreters = remove_interpreters
self._remove_directives = remove_directives
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
The keys are the headers and the values are the text under each header.
"""
rst_tups: List[Tuple[Optional[str], str]] = []
lines = rst_text.split("\n")
current_header = None
current_text = ""
for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
# treat the line as a header underline when its length matches the title
# above it, or when the line two above is blank (typical rst spacing)
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2].strip() == ""):
if current_header is not None:
if current_text == "" or None:
continue
# removes the next heading from current Document
if current_text.endswith(lines[i - 1] + "\n"):
current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")]
rst_tups.append((current_header, current_text))
current_header = lines[i - 1]
current_text = ""
else:
current_text += line + "\n"
rst_tups.append((current_header, current_text))
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
# rst_tups = [
# (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
# for key, value in rst_tups
# ]
# else:
# rst_tups = [
# (key, re.sub("\n", "", value)) for key, value in rst_tups
# ]
if current_header is None:
rst_tups = [
(key, re.sub("\n", "", value)) for key, value in rst_tups
]
return rst_tups
def remove_images(self, content: str) -> str:
pattern = r"\.\. image:: (.*)"
content = re.sub(pattern, "", content)
return content
def remove_hyperlinks(self, content: str) -> str:
pattern = r"`(.*?) <(.*?)>`_"
content = re.sub(pattern, r"\1", content)
return content
def remove_directives(self, content: str) -> str:
"""Removes reStructuredText directives (e.g. ".. note::")."""
pattern = r"\.\.([^:]+)::"
content = re.sub(pattern, "", content)
return content
def remove_interpreters(self, content: str) -> str:
"""Removes reStructuredText Interpreted Text Roles"""
pattern = r":(\w+):"
content = re.sub(pattern, "", content)
return content
def remove_table_excess(self, content: str) -> str:
"""Pattern to remove grid table separators"""
pattern = r"^\+[-]+\+[-]+\+$"
content = re.sub(pattern, "", content, flags=re.MULTILINE)
return content
def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
"""Pattern to match 2 or more consecutive whitespaces"""
pattern = r"\s{2,}"
content = [(key, re.sub(pattern, " ", value)) for key, value in content]
return content
def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
"""Pattern to match 2 or more consecutive characters"""
pattern = r"(\S)\1{2,}"
content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content]
return content
def _init_parser(self) -> Dict:
"""Initialize the parser with the config."""
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
content = f.read()
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
if self._remove_table_excess:
content = self.remove_table_excess(content)
if self._remove_directives:
content = self.remove_directives(content)
if self._remove_interpreters:
content = self.remove_interpreters(content)
rst_tups = self.rst_to_tups(content)
if self._remove_whitespaces_excess:
rst_tups = self.remove_whitespaces_excess(rst_tups)
if self._remove_characters_excess:
rst_tups = self.remove_characters_excess(rst_tups)
return rst_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
results = []
# TODO: don't include headers right now
for header, value in tups:
if header is None:
results.append(value)
else:
results.append(f"\n\n{header}\n{value}")
return results
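# A minimal usage sketch (hypothetical file path, not part of the original module):
#
#     parser = RstParser(remove_directives=True, remove_interpreters=True)
#     sections = parser.parse_file(Path("docs/install.rst"))
#     # sections is a list of strings, one per underlined rst header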

View File

@@ -0,0 +1,115 @@
"""Tabular parser.
Contains parsers for tabular data files.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from application.parser.file.base_parser import BaseParser
class CSVParser(BaseParser):
"""CSV parser.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
"""
def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file.
Returns:
Union[str, List[str]]: a string or a List of strings.
"""
import csv  # part of the standard library, so no import guard is needed
text_list = []
with open(file, "r") as fp:
csv_reader = csv.reader(fp)
for row in csv_reader:
text_list.append(", ".join(row))
if self._concat_rows:
return "\n".join(text_list)
else:
return text_list
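# Usage sketch (hypothetical path): with concat_rows=True (the default) the whole
# file comes back as one newline-joined string; with concat_rows=False you get a
# list with one ", "-joined string per row:
#
#     rows = CSVParser(concat_rows=False).parse_file(Path("data/table.csv"))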
class PandasCSVParser(BaseParser):
r"""Pandas-based CSV parser.
Parses CSVs using the separator detection from the Pandas `read_csv` function.
If special parameters are required, use the `pandas_config` dict.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
col_joiner (str): Separator to use for joining cols per row.
Set to ", " by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
pandas_config (dict): Options for the `pandas.read_csv` function call.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
for more information.
Set to empty dict by default, this means pandas will try to figure
out the separators, table head, etc. on its own.
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: Optional[dict] = None,  # None instead of a shared mutable default
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._col_joiner = col_joiner
self._row_joiner = row_joiner
self._pandas_config = pandas_config or {}
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file."""
try:
import pandas as pd
except ImportError:
raise ValueError("pandas module is required to read CSV files.")
df = pd.read_csv(file, **self._pandas_config)
text_list = df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
if self._concat_rows:
return (self._row_joiner).join(text_list)
else:
return text_list
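# Usage sketch (hypothetical path and config): pandas_config is passed straight
# through to pandas.read_csv, so e.g. a custom separator can be supplied:
#
#     parser = PandasCSVParser(pandas_config={"sep": ";"})
#     text = parser.parse_file(Path("data/table.csv"))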

View File

@@ -0,0 +1,66 @@
import os
import javalang
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.java'):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()
methods = {}
tree = javalang.parse.parse(java_code)
for _, node in tree.filter(javalang.tree.MethodDeclaration):
method_name = node.name
start_line = node.position.line - 1
end_line = start_line
brace_count = 0
opened = False
for line in java_code.splitlines()[start_line:]:
end_line += 1
if "{" in line:
opened = True
brace_count += line.count("{") - line.count("}")
# stop only once the braces balance after the first opening brace,
# otherwise a signature line without "{" would end the method immediately
if opened and brace_count == 0:
break
method_source_code = "\n".join(java_code.splitlines()[start_line:end_line])
methods[method_name] = method_source_code
return methods
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
classes = {}
tree = javalang.parse.parse(source_code)
for class_decl in tree.types:
class_name = class_decl.name
declarations = []
methods = []
for field_decl in class_decl.fields:
field_name = field_decl.declarators[0].name
field_type = field_decl.type.name
declarations.append(f"{field_type} {field_name}")
for method_decl in class_decl.methods:
methods.append(method_decl.name)
class_string = "Declarations: " + ", ".join(declarations) + "\n Method name: " + ", ".join(methods)
classes[class_name] = class_string
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
classes_dict = {}
for file in files:
functions = extract_functions(file)
if functions:
functions_dict[file] = functions
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict

View File

@@ -0,0 +1,70 @@
import os
import escodegen
import esprima
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.js'):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
functions = {}
tree = esprima.parseScript(source_code)
for node in tree.body:
if node.type == 'FunctionDeclaration':
func_name = node.id.name if node.id else '<anonymous>'
functions[func_name] = escodegen.generate(node)
elif node.type == 'VariableDeclaration':
for declaration in node.declarations:
if declaration.init and declaration.init.type == 'FunctionExpression':
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name
functions[func_name] = escodegen.generate(subnode.value)
elif subnode.type == 'VariableDeclaration':
for declaration in subnode.declarations:
if declaration.init and declaration.init.type == 'FunctionExpression':
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
classes = {}
tree = esprima.parseScript(source_code)
for node in tree.body:
if node.type == 'ClassDeclaration':
class_name = node.id.name
function_names = []
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
function_names.append(subnode.key.name)
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
classes_dict = {}
for file in files:
functions = extract_functions(file)
if functions:
functions_dict[file] = functions
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict

View File

@@ -0,0 +1,94 @@
import os
from typing import Tuple
import tiktoken
from application.vectorstore.vector_creator import VectorCreator
from application.core.settings import settings
from retry import retry
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings
def num_tokens_from_string(string: str, encoding_name: str) -> Tuple[int, float]:
# Function to convert a string to tokens and estimate the user cost ($0.0004 per 1k tokens).
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = (num_tokens / 1000) * 0.0004
return num_tokens, total_price
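# Worked example: a 2,500-token string costs (2500 / 1000) * 0.0004 = $0.001.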
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])
def call_openai_api(docs, folder_name, task_status):
# Function to create a vector store from the documents and save it to disk.
# create output folder if it doesn't exist
if not os.path.exists(f"{folder_name}"):
os.makedirs(f"{folder_name}")
from tqdm import tqdm
c1 = 0
if settings.VECTOR_STORE == "faiss":
docs_init = [docs[0]]
docs.pop(0)
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
docs_init = docs_init,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
else:
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
# Uncomment for MPNet embeddings
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
s1 = len(docs)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
if task_status:
task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
store_add_texts_with_retry(store, i)
except Exception as e:
print(e)
print("Error on ", i)
print("Saving progress")
print(f"stopped at {c1} out of {len(docs)}")
store.save_local(f"{folder_name}")
break
c1 += 1
if settings.VECTOR_STORE == "faiss":
store.save_local(f"{folder_name}")
def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
# docs_content = (" ".join(docs))
docs_content = ""
for doc in docs:
docs_content += doc.page_content
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
# an empty input counts as approval; there is no Celery task status on this CLI path
if user_input in ("y", ""):
call_openai_api(docs, folder_name, None)
else:
print("The API was not called. No money was spent.")

View File

@@ -0,0 +1,121 @@
import ast
import os
from pathlib import Path
import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.py'):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
functions = {}
tree = ast.parse(source_code)
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
func_name = node.name
func_def = ast.get_source_segment(source_code, node)
functions[func_name] = func_def
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
classes = {}
tree = ast.parse(source_code)
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
class_name = node.name
function_names = []
for subnode in ast.walk(node):
if isinstance(subnode, ast.FunctionDef):
function_names.append(subnode.name)
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
classes_dict = {}
for file in files:
functions = extract_functions(file)
if functions:
functions_dict[file] = functions
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict
def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
print(f"Processing file {i}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
prompt = PromptTemplate(
input_variables=["code"],
template="Code: \n{code}, \nDocumentation: ",
)
llm = OpenAI(temperature=0)
for j, (name, function) in enumerate(functions.items(), start=1):
print(f"Processing function {j}/{len(functions)}")
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
def parse_classes(classes_dict, formats, dir):
c1 = len(classes_dict)
for i, (source, classes) in enumerate(classes_dict.items()):
print(f"Processing file {i + 1}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
prompt = PromptTemplate(
input_variables=["class_name", "functions_names"],
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
)
llm = OpenAI(temperature=0)
for j, (name, function_names) in enumerate(classes.items(), start=1):
print(f"Processing class {j}/{len(classes)}")
response = llm(prompt.format(class_name=name, functions_names=function_names))
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content))
total_price = ((num_tokens / 1000) * 0.02)
print(f"Number of Tokens = {num_tokens:,d}")
print(f"Approx Cost = ${total_price:,.2f}")
user_input = input("Price Okay? (Y/N)\n").lower()
if user_input in ("y", ""):
if not Path("outputs").exists():
Path("outputs").mkdir()
parse_functions(functions_dict, formats, dir)
parse_classes(classes_dict, formats, dir)
print("All done!")
else:
print("The API was not called. No money was spent.")

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,34 @@
"""Base schema for readers."""
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from application.parser.schema.schema import BaseDocument
@dataclass
class Document(BaseDocument):
"""Generic interface for a data document.
This document connects to data sources.
"""
def __post_init__(self) -> None:
"""Post init."""
if self.text is None:
raise ValueError("text field not set.")
@classmethod
def get_type(cls) -> str:
"""Get Document type."""
return "Document"
def to_langchain_format(self) -> LCDocument:
"""Convert struct to LangChain document format."""
metadata = self.extra_info or {}
return LCDocument(page_content=self.text, metadata=metadata)
@classmethod
def from_langchain_format(cls, doc: LCDocument) -> "Document":
"""Convert struct from LangChain document format."""
return cls(text=doc.page_content, extra_info=doc.metadata)
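# Round-trip sketch: the two converters are inverses for text and metadata, e.g.
#
#     doc = Document(text="hello", extra_info={"source": "readme"})
#     lc_doc = doc.to_langchain_format()   # LCDocument(page_content="hello", metadata={...})
#     back = Document.from_langchain_format(lc_doc)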

View File

@@ -0,0 +1,64 @@
"""Base schema for data structures."""
from abc import abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from dataclasses_json import DataClassJsonMixin
@dataclass
class BaseDocument(DataClassJsonMixin):
"""Base document.
Generic abstract interface that captures both index structs
as well as documents.
"""
# TODO: consolidate fields from Document/IndexStruct into base class
text: Optional[str] = None
doc_id: Optional[str] = None
embedding: Optional[List[float]] = None
# extra fields
extra_info: Optional[Dict[str, Any]] = None
@classmethod
@abstractmethod
def get_type(cls) -> str:
"""Get Document type."""
def get_text(self) -> str:
"""Get text."""
if self.text is None:
raise ValueError("text field not set.")
return self.text
def get_doc_id(self) -> str:
"""Get doc_id."""
if self.doc_id is None:
raise ValueError("doc_id not set.")
return self.doc_id
@property
def is_doc_id_none(self) -> bool:
"""Check if doc_id is None."""
return self.doc_id is None
def get_embedding(self) -> List[float]:
"""Get embedding.
Errors if embedding is None.
"""
if self.embedding is None:
raise ValueError("embedding not set.")
return self.embedding
@property
def extra_info_str(self) -> Optional[str]:
"""Extra info string."""
if self.extra_info is None:
return None
return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()])

View File

@@ -0,0 +1,77 @@
import re
from math import ceil
from typing import List
import tiktoken
from application.parser.schema.base import Document
def separate_header_and_body(text):
header_pattern = r"^(.*?\n){3}"
match = re.match(header_pattern, text)
if match is None:
# fewer than three lines: treat the whole text as body
return "", text
header = match.group(0)
body = text[len(header):]
return header, body
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None
for doc in documents:
doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
if current_group is not None:
docs.append(current_group)
return docs
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:
token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
if token_length <= max_tokens:
docs.append(doc)
else:
header, body = separate_header_and_body(doc.text)
if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens:
body = doc.text
header = ""
num_body_parts = ceil(token_length / max_tokens)
part_length = ceil(len(body) / num_body_parts)
body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
for i, body_part in enumerate(body_parts):
new_doc = Document(text=header + body_part.strip(),
doc_id=f"{doc.doc_id}-{i}",
embedding=doc.embedding,
extra_info=doc.extra_info)
docs.append(new_doc)
return docs
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except Exception:
print("Splitting failed, try running without token_check")
return documents

View File

@@ -0,0 +1,9 @@
You are DocsGPT, a friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
Use the following pieces of context to help answer the user's question. If it's not relevant to the question, provide a friendly response.
You have access to chat history, and can use it to help answer the question.
When using code examples, use the following format:
```(language)
(code)
```
----------------
{summaries}

View File

@@ -0,0 +1,3 @@
Use the following pieces of context to help answer the user's question. If it's not relevant to the question, respond with "-"
----------------
{context}

View File

@@ -1,126 +1,106 @@
aiodns==3.0.0
aiohttp==3.8.3
aiohttp==3.8.5
aiohttp-retry==2.8.3
aiosignal==1.3.1
alabaster==0.7.13
aleph-alpha-client==2.16.0
anyio==3.6.2
argilla==1.3.0
aleph-alpha-client==2.16.1
amqp==5.1.1
async-timeout==4.0.2
attrs==22.2.0
Babel==2.11.0
backoff==2.2.1
billiard==3.6.4.0
blobfile==2.0.1
boto3==1.26.82
botocore==1.29.82
boto3==1.28.20
celery==5.2.7
cffi==1.15.1
charset-normalizer==2.1.1
charset-normalizer==3.1.0
click==8.1.3
cohere==3.4.0
click-didyoumean==0.3.0
click-plugins==1.1.1
click-repl==0.2.0
cryptography==41.0.3
dataclasses-json==0.5.7
decorator==5.1.1
deeplake==3.2.12
Deprecated==1.2.13
dill==0.3.6
docutils==0.19
docx2txt==0.8
dnspython==2.3.0
ecdsa==0.18.0
elasticsearch==8.9.0
entrypoints==0.4
escodegen==1.0.10
esprima==4.0.1
esutils==1.0.1
et-xmlfile==1.1.0
faiss-cpu==1.7.3
filelock==3.9.0
Flask==2.2.2
Flask==2.2.5
Flask-Cors==3.0.10
frozenlist==1.3.3
greenlet==2.0.2
geojson==2.5.0
gunicorn==20.1.0
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
hub==3.0.1
huggingface-hub==0.12.0
humbug==0.2.8
greenlet==2.0.2
gpt4all==0.1.7
huggingface-hub==0.15.1
humbug==0.3.2
idna==3.4
imagesize==1.4.1
itsdangerous==2.1.2
javalang==0.13.0
Jinja2==3.1.2
jmespath==1.0.1
joblib==1.2.0
langchain==0.0.98
kombu==5.2.4
langchain==0.0.263
loguru==0.6.0
lxml==4.9.2
manifest-ml==0.1.1
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.14
mypy-extensions==0.4.3
mypy-extensions==1.0.0
networkx==3.0
npx
nltk==3.8.1
numcodecs==0.11.0
numpy==1.23.5
openai==0.27.0
openpyxl==3.1.1
numpy==1.24.2
openai==0.27.8
packaging==23.0
pandas==1.5.3
pathos==0.3.0
Pillow==9.4.0
pox==0.3.2
ppft==1.7.6.6
prompt-toolkit==3.0.38
py==1.11.0
pyasn1==0.4.8
pycares==4.3.0
pycparser==2.21
pycryptodomex==3.17
pydantic==1.10.4
Pygments==2.14.0
pycryptodome==3.19.0
pydantic==1.10.5
PyJWT==2.6.0
pymongo==4.3.3
pyowm==3.3.0
PyPDF2==3.0.1
PySocks==1.7.1
pytest
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==0.21.1
python-magic==0.4.27
python-pptx==0.6.21
python-dotenv==1.0.0
python-jose==3.3.0
pytz==2022.7.1
PyYAML==6.0
redis==4.5.1
redis==4.5.4
regex==2022.10.31
requests==2.28.2
requests==2.31.0
retry==0.9.2
rfc3986==1.5.0
s3transfer==0.6.0
scikit-learn==1.2.1
scipy==1.10.0
sentence-transformers==2.2.2
sentencepiece==0.1.97
rsa==4.9
scikit-learn==1.2.2
scipy==1.10.1
sentencepiece
six==1.16.0
sniffio==1.3.0
snowballstemmer==2.2.0
Sphinx==6.1.3
sphinxcontrib-applehelp==1.0.4
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.1
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
SQLAlchemy==1.4.46
sqlitedict==2.1.0
tenacity==8.2.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.1.2
tokenizers==0.13.2
torch==1.13.1
torchvision==0.14.1
tqdm==4.64.1
transformers==4.26.0
tiktoken
tqdm==4.65.0
transformers==4.30.0
typer==0.7.0
typing-inspect==0.8.0
typing_extensions==4.4.0
unstructured==0.4.11
typing_extensions==4.5.0
urllib3==1.26.14
Werkzeug==2.2.3
wrapt==1.14.1
XlsxWriter==3.0.8
xxhash==3.2.0
vine==5.0.0
wcwidth==0.2.6
yarl==1.8.2

Binary file not shown.

Before

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 352 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 631 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

View File

@@ -1 +0,0 @@
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}

View File

@@ -1,19 +0,0 @@
function resetApiKey() {
const modal = document.getElementById("modal");
modal.classList.toggle("hidden");
}
const apiKeyForm = document.getElementById("api-key-form");
if (apiKeyForm) {
apiKeyForm.addEventListener("submit", function(event) {
event.preventDefault();
const apiKeyInput = document.getElementById("api-key-input");
const apiKey = apiKeyInput.value;
localStorage.setItem("apiKey", apiKey);
apiKeyInput.value = "";
modal.classList.toggle("hidden");
});
}

View File

@@ -1,74 +0,0 @@
var form = document.getElementById('message-form');
var errorModal = document.getElementById('error-alert')
document.getElementById('close').addEventListener('click',()=>{
errorModal.classList.toggle('hidden')
})
function submitForm(event){
event.preventDefault()
var message = document.getElementById("message-input").value;
console.log(message.length)
if(message.length === 0){
return
}
msg_html = '<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end"><p class="text-sm">'
msg_html += message
msg_html += '</p></div>'
document.getElementById("messages").innerHTML += msg_html;
let chatWindow = document.getElementById("messages-container");
chatWindow.scrollTop = chatWindow.scrollHeight;
document.getElementById("message-input").value = "";
document.getElementById("button-submit").innerHTML = '<i class="fa fa-circle-o-notch fa-spin"></i> Thinking...';
document.getElementById("button-submit").disabled = true;
if (localStorage.getItem('activeDocs') == null) {
localStorage.setItem('activeDocs', 'default')
}
fetch('/api/answer', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({question: message,
api_key: localStorage.getItem('apiKey'),
embeddings_key: localStorage.getItem('apiKey'),
history: localStorage.getItem('chatHistory'),
active_docs: localStorage.getItem('activeDocs')}),
}).then((response)=> response.json())
.then(data => {
console.log('Success:', data);
if(data.error){
document.getElementById('text-error').textContent = `Error : ${JSON.stringify(data.message)}`
errorModal.classList.toggle('hidden')
}
if(data.answer){
msg_html = '<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start"><code class="text-sm">'
data.answer = data.answer.replace(/\n/g, "<br>");
msg_html += data.answer
msg_html += '</code></div>'
document.getElementById("messages").innerHTML += msg_html;
let chatWindow = document.getElementById("messages-container");
chatWindow.scrollTop = chatWindow.scrollHeight;
}
document.getElementById("button-submit").innerHTML = 'Send';
document.getElementById("button-submit").disabled = false;
let chatHistory = [message, data.answer || ''];
localStorage.setItem('chatHistory', JSON.stringify(chatHistory));
})
.catch((error) => {
console.error('Error:', error);
// console.log(error);
// document.getElementById("button-submit").innerHTML = 'Send';
// document.getElementById("button-submit").disabled = false;
});
}
window.addEventListener('submit',submitForm)

View File

@@ -1,15 +0,0 @@
document.getElementById("select-docs").addEventListener("change", function() {
localStorage.setItem('activeDocs', this.value)
fetch('/api/docs_check', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({docs: this.value}),
}).then(response => response.json()).then(
data => {
console.log('Success:', data);
}
)
});

View File

@@ -1,37 +0,0 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
@media screen and (max-width: 1024px) {
.text-lg {
font-size: 3.125rem;
margin: 2rem;
line-height: inherit;
}
.text-sm {
font-size: 2.5rem;
margin: 1.5rem;
line-height: inherit;
}
}
.loader {
border: 16px solid #f3f3f3; /* Light grey */
border-top: 16px solid #3498db; /* Blue */
border-radius: 50%;
width: 120px;
height: 120px;
animation: spin 2s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}

View File

@@ -1,195 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<title>DocsGPT 🦖 Preview</title>
<link href="{{url_for('static',filename='dist/css/output.css')}}" rel="stylesheet">
<link rel="favicon" href="{{ url_for('static', filename='favicon/favicon.ico') }}">
<link rel="apple-touch-icon" sizes="180x180" href="{{ url_for('static', filename='favicon/apple-touch-icon.png') }}">
<link rel="icon" type="image/png" sizes="32x32" href="{{ url_for('static', filename='favicon/favicon-32x32.png') }}">
<link rel="icon" type="image/png" sizes="16x16" href="{{ url_for('static', filename='favicon/favicon-16x16.png') }}">
<link rel="manifest" href="{{ url_for('static', filename='favicon//site.webmanifest') }}">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
</head>
<body>
<header class="bg-white p-2 flex justify-between items-center">
<h1 class="text-lg font-medium">DocsGPT 🦖 Preview</h1>
<div>
<a href="https://github.com/arc53/docsgpt" class="text-blue-500 hover:text-blue-800 text-sm">About</a>
{% if not api_key_set %}
<button class="text-sm text-yellow-500 hover:text-yellow-800" onclick="resetApiKey()">Reset Key</button>
{% endif %}
</div>
</header>
<!-- Alert Info -->
<div class="border flex justify-between
w-auto px-4 py-3 rounded relative
hidden" style="background-color: rgb(197, 51, 51);color: white;" id="error-alert" role="alert">
<span class="block sm:inline" id="text-error"></span>
<strong class="text-xl align-center alert-del" style="cursor: pointer;" id="close">&times;</strong>
</div>
<div class="lg:flex ml-2 mr-2">
<div class="lg:w-3/4 min-h-screen max-h-screen">
<div class="w-full flex flex-col h-5/6">
<div id="messages-container" style="overflow: auto;" class="sm:max-lg:mb-[12rem]">
<div id="messages" class="w-full flex flex-col mt-2" >
<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start">
<p class="text-sm">Hello, ask me anything about this library. Im here to help</p>
</div>
<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end">
<p class="text-sm">How to merge tables?</p>
</div>
<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start">
<p class="text-sm">To merge two tables in pandas, you can use the pd.merge() function. The basic syntax is:<br>
pd.merge(left, right, on, how)<br>
where left and right are the two tables to merge, on is the column to merge on, and how is the type of merge to perform.<br>
For example, to merge the two tables df1 and df2 on the column 'key', you can use:<br>
pd.merge(df1, df2, on='key', how='left')<br>
This will return a new DataFrame with all the columns from both tables, and only the rows that match the 'key' column. </p>
</div>
</div>
</div>
<div class="fixed bottom-0 w-full mt-4 mb-2 lg:w-3/4">
<form id="message-form" autocomplete="off" class="flex items-stretch">
<input autocomplete="off" id="message-input" class="bg-white p-2 rounded-lg ml-2 text-sm w-full" type="text" placeholder="Type your message here...">
<button id="button-submit" class="bg-blue-500 text-white p-2 rounded-lg ml-2 mr-2 text-sm sm:max-lg:p-5" type="submit">Send</button>
</form>
</div>
</div>
</div>
<div class="lg:w-1/4 p-2 sm:max-lg:hidden">
<p class="text-sm">This is a chatbot that uses the GPT-3, Faiss and <a href="https://github.com/hwchase17/langchain" class="text-blue-500 hover:text-blue-800">LangChain</a> to answer questions</p>
<br>
<p class="text-sm">The source code is available on <a href="https://github.com/arc53/docsgpt" class="text-blue-500 hover:text-blue-800">Github</a></p><br>
<p class="text-sm">Currently It uses python pandas documentation, so it will respond to information relevant to pandas. If you want to train it on different documentation - <a href="https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation" class="text-blue-500 hover:text-blue-800"> please follow this guide </a></p><br>
<p class="text-sm">If you want to launch it on your own server - <a href="https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation" class="text-blue-500 hover:text-blue-800"> follow this guide </a></p><br>
<label class="block mb-2 text-sm font-medium text-gray-900">Select documentation from DocsHUB</label>
<select id="select-docs" class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5">
<option selected value="default">Choose documentation</option>
<option value="default">Default</option>
</select>
</div>
</div>
<div class="flex items-center justify-center h-full">
</div>
{% if not api_key_set %}
<div class="fixed z-10 overflow-y-auto top-0 w-full left-0 show" id="modal">
<div class="flex items-center justify-center min-height-100vh pt-4 px-4 pb-20 text-center sm:block sm:p-0">
<div class="fixed inset-0 transition-opacity">
<div class="absolute inset-0 bg-gray-900 opacity-75" />
</div>
<span class="hidden sm:inline-block sm:align-middle sm:h-screen">&#8203;</span>
<div class=" text-sm inline-block align-center bg-white rounded-lg text-left overflow-hidden shadow-xl transform transition-all sm:my-8 sm:align-middle sm:max-w-lg sm:w-full" role="dialog" aria-modal="true" aria-labelledby="modal-headline">
<form id="api-key-form">
<div class="bg-white px-4 pt-5 pb-4 sm:p-6 sm:pb-4">
<h2>Before you can start using DocsGPT, we need you to provide an API key for the LLM. Currently we support only OpenAI, but many more are coming soon. You can find it <a class="text-blue-500 hover:text-blue-800" href="https://platform.openai.com/account/api-keys">here</a></h2><br>
<label>OpenAI API key:</label>
<input id="api-key-input" type="password" class="w-full bg-gray-100 p-2 mt-2 mb-3" placeholder="Paste you Api Key here">
</div>
<div class="bg-gray-200 px-4 py-3 text-right">
<button type="submit" class="py-2 px-4 bg-blue-500 text-white rounded hover:bg-blue-700 mr-2">Save</button>
</div>
</form>
</div>
</div>
</div>
{% endif %}
<script>
function docsIndex() {
// loads latest index from https://raw.githubusercontent.com/arc53/DocsHUB/main/combined.json
// and stores it in localStorage
fetch('https://d3dg1063dc54p9.cloudfront.net/combined.json')
.then(response => response.json())
.then(data => {
localStorage.setItem("docsIndex", JSON.stringify(data));
localStorage.setItem("docsIndexDate", Date.now());
generateOptions()
}
)
}
function generateOptions(){
docsIndex = localStorage.getItem('docsIndex')
// create option on select with id select-docs
var select = document.getElementById("select-docs");
// convert docsIndex to json
docsIndex = JSON.parse(docsIndex)
// create option for each key in docsIndex
for (var key in docsIndex) {
var option = document.createElement("option");
if (docsIndex[key].name == docsIndex[key].language) {
option.text = docsIndex[key].name + " " + docsIndex[key].version;
option.value = docsIndex[key].name + "/" + ".project" + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/";
if (docsIndex[key].model == "{{ embeddings_choice }}") {
select.add(option);
}
}
else {
option.text = docsIndex[key].name + " " + docsIndex[key].version;
option.value = docsIndex[key].language + "/" + docsIndex[key].name + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/";
if (docsIndex[key].model == "{{ embeddings_choice }}") {
select.add(option);
}
}
}
}
{% if not api_key_set %}
if (localStorage.getItem('apiKey') === null) {
console.log("apiKey is not set")
document.getElementById('modal').classList.toggle('hidden')
}
{% endif %}
if (localStorage.getItem('docsIndex') === null) {
console.log("docsIndex is not set")
docsIndex()
}
else if (localStorage.getItem("docsIndexDate") < Date.now() - 900000) {
console.log("docsIndex is older than 15 minutes")
docsIndex()
}
generateOptions()
</script>
{% if not api_key_set %}
<script src="{{url_for('static',filename='src/authapi.js')}}"></script>
{% endif %}
<script src="{{url_for('static',filename='src/chat.js')}}"></script>
<script src="{{url_for('static',filename='src/choiceChange.js')}}"></script>
</body>
</html>

View File

View File

@@ -0,0 +1,51 @@
from abc import ABC, abstractmethod
import os
from langchain.embeddings import (
OpenAIEmbeddings,
HuggingFaceEmbeddings,
CohereEmbeddings,
HuggingFaceInstructEmbeddings,
)
from application.core.settings import settings
class BaseVectorStore(ABC):
def __init__(self):
pass
@abstractmethod
def search(self, *args, **kwargs):
pass
def is_azure_configured(self):
return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME
def _get_embeddings(self, embeddings_name, embeddings_key=None):
embeddings_factory = {
"openai_text-embedding-ada-002": OpenAIEmbeddings,
"huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceEmbeddings,
"huggingface_hkunlp/instructor-large": HuggingFaceInstructEmbeddings,
"cohere_medium": CohereEmbeddings
}
if embeddings_name not in embeddings_factory:
raise ValueError(f"Invalid embeddings_name: {embeddings_name}")
if embeddings_name == "openai_text-embedding-ada-002":
if self.is_azure_configured():
os.environ["OPENAI_API_TYPE"] = "azure"
embedding_instance = embeddings_factory[embeddings_name](
model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME
)
else:
embedding_instance = embeddings_factory[embeddings_name](
openai_api_key=embeddings_key
)
elif embeddings_name == "cohere_medium":
embedding_instance = embeddings_factory[embeddings_name](
cohere_api_key=embeddings_key
)
else:
embedding_instance = embeddings_factory[embeddings_name]()
return embedding_instance

View File

@@ -0,0 +1,221 @@
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
import elasticsearch
class Document(str):
"""Class for storing a piece of text and associated metadata."""
def __new__(cls, page_content: str, metadata: dict):
instance = super().__new__(cls, page_content)
instance.page_content = page_content
instance.metadata = metadata
return instance
class ElasticsearchStore(BaseVectorStore):
_es_connection = None # Class attribute to hold the Elasticsearch connection
def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
super().__init__()
self.path = path.replace("application/indexes/", "").rstrip("/")
self.embeddings_key = embeddings_key
self.index_name = index_name
if ElasticsearchStore._es_connection is None:
connection_params = {}
if settings.ELASTIC_URL:
connection_params["hosts"] = [settings.ELASTIC_URL]
connection_params["http_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
elif settings.ELASTIC_CLOUD_ID:
connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID
connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
else:
raise ValueError("Please provide either elasticsearch_url or cloud_id.")
ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params)
self.docsearch = ElasticsearchStore._es_connection
def connect_to_elasticsearch(
*,
es_url = None,
cloud_id = None,
api_key = None,
username = None,
password = None,
):
try:
import elasticsearch
except ImportError:
raise ImportError(
"Could not import elasticsearch python package. "
"Please install it with `pip install elasticsearch`."
)
if es_url and cloud_id:
raise ValueError(
"Both es_url and cloud_id are defined. Please provide only one."
)
connection_params = {}
if es_url:
connection_params["hosts"] = [es_url]
elif cloud_id:
connection_params["cloud_id"] = cloud_id
else:
raise ValueError("Please provide either elasticsearch_url or cloud_id.")
if api_key:
connection_params["api_key"] = api_key
elif username and password:
connection_params["basic_auth"] = (username, password)
es_client = elasticsearch.Elasticsearch(
**connection_params,
)
try:
es_client.info()
except Exception as e:
raise e
return es_client
def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
vector = embeddings.embed_query(question)
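# hybrid retrieval: approximate kNN over the stored vectors plus a BM25 text
# match; a reciprocal-rank-fusion ("rrf") clause is assembled below, though the
# search call only forwards the query and knn parts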
knn = {
"filter": [{"match": {"metadata.store.keyword": self.path}}],
"field": "vector",
"k": k,
"num_candidates": 100,
"query_vector": vector,
}
full_query = {
"knn": knn,
"query": {
"bool": {
"must": [
{
"match": {
"text": {
"query": question,
}
}
}
],
"filter": [{"match": {"metadata.store.keyword": self.path}}],
}
},
"rank": {"rrf": {}},
}
resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn'])
# create Document objects from the results: page_content from ['_source']['text'], metadata from ['_source']['metadata']
doc_list = []
for hit in resp['hits']['hits']:
doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata']))
return doc_list
def _create_index_if_not_exists(
self, index_name, dims_length
):
if self._es_connection.indices.exists(index=index_name):
print(f"Index {index_name} already exists.")
else:
indexSettings = self.index(
dims_length=dims_length,
)
self._es_connection.indices.create(index=index_name, **indexSettings)
def index(
self,
dims_length,
):
return {
"mappings": {
"properties": {
"vector": {
"type": "dense_vector",
"dims": dims_length,
"index": True,
"similarity": "cosine",
},
}
}
}
def add_texts(
self,
texts,
metadatas = None,
ids = None,
refresh_indices = True,
create_index_if_not_exists = True,
bulk_kwargs = None,
**kwargs,
):
from elasticsearch.helpers import BulkIndexError, bulk
bulk_kwargs = bulk_kwargs or {}
import uuid
texts = list(texts)  # materialize once so a generator is not exhausted twice
ids = ids or [str(uuid.uuid4()) for _ in texts]
requests = []
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
vectors = embeddings.embed_documents(texts)
dims_length = len(vectors[0])
if create_index_if_not_exists:
self._create_index_if_not_exists(
index_name=self.index_name, dims_length=dims_length
)
for i, (text, vector) in enumerate(zip(texts, vectors)):
metadata = metadatas[i] if metadatas else {}
requests.append(
{
"_op_type": "index",
"_index": self.index_name,
"text": text,
"vector": vector,
"metadata": metadata,
"_id": ids[i],
}
)
if len(requests) > 0:
try:
success, failed = bulk(
self._es_connection,
requests,
stats_only=True,
refresh=refresh_indices,
**bulk_kwargs,
)
return ids
except BulkIndexError as e:
print(f"Error adding texts: {e}")
firstError = e.errors[0].get("index", {}).get("error", {})
print(f"First error reason: {firstError.get('reason')}")
raise e
else:
return []
def delete_index(self):
self._es_connection.delete_by_query(index=self.index_name, query={"match": {
"metadata.store.keyword": self.path}},)

View File

@@ -0,0 +1,26 @@
from application.vectorstore.base import BaseVectorStore
from langchain import FAISS
from application.core.settings import settings
class FaissStore(BaseVectorStore):
def __init__(self, path, embeddings_key, docs_init=None):
super().__init__()
self.path = path
if docs_init:
self.docsearch = FAISS.from_documents(
docs_init, self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
)
else:
self.docsearch = FAISS.load_local(
self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY)
)
def search(self, *args, **kwargs):
return self.docsearch.similarity_search(*args, **kwargs)
def add_texts(self, *args, **kwargs):
return self.docsearch.add_texts(*args, **kwargs)
def save_local(self, *args, **kwargs):
return self.docsearch.save_local(*args, **kwargs)

View File

@@ -0,0 +1,16 @@
from application.vectorstore.faiss import FaissStore
from application.vectorstore.elasticsearch import ElasticsearchStore
class VectorCreator:
vectorstores = {
'faiss': FaissStore,
'elasticsearch':ElasticsearchStore
}
@classmethod
def create_vectorstore(cls, type, *args, **kwargs):
vectorstore_class = cls.vectorstores.get(type.lower())
if not vectorstore_class:
raise ValueError(f"No vectorstore class found for type {type}")
return vectorstore_class(*args, **kwargs)
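# Usage sketch, mirroring the call in open_ai_func.call_openai_api (path hypothetical):
#
#     store = VectorCreator.create_vectorstore(
#         settings.VECTOR_STORE,  # 'faiss' or 'elasticsearch'
#         path="application/indexes/local/job1",
#         embeddings_key=os.getenv("EMBEDDINGS_KEY"),
#     )
#     store.add_texts(["some text"], metadatas=[{"store": "local/job1"}])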

107
application/worker.py Normal file
View File

@@ -0,0 +1,107 @@
import os
import shutil
import random
import string
import zipfile
from urllib.parse import urljoin
import nltk
import requests
from application.core.settings import settings
from application.parser.file.bulk import SimpleDirectoryReader
from application.parser.open_ai_func import call_openai_api
from application.parser.schema.base import Document
from application.parser.token_func import group_split
try:
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
except FileExistsError:
pass
def metadata_from_filename(title):
store = title.split('/')
store = store[1] + '/' + store[2]
return {'title': title, 'store': store}
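# e.g. metadata_from_filename('inputs/local/job1/install.rst')
#      -> {'title': 'inputs/local/job1/install.rst', 'store': 'local/job1'}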
def generate_random_string(length):
return ''.join(random.choices(string.ascii_letters, k=length))
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def ingest_worker(self, directory, formats, name_job, filename, user):
# directory = 'inputs' or 'temp'
# formats = [".rst", ".md"]
input_files = None
recursive = True
limit = None
exclude = True
# name_job = 'job1'
# filename = 'install.rst'
# user = 'local'
sample = False
token_check = True
min_tokens = 150
max_tokens = 1250
full_path = directory + '/' + user + '/' + name_job
import sys
print(full_path, file=sys.stderr)
# check if API_URL env variable is set
file_data = {'name': name_job, 'file': filename, 'user': user}
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
# check if file is in the response
print(response, file=sys.stderr)
file = response.content
if not os.path.exists(full_path):
os.makedirs(full_path)
with open(full_path + '/' + filename, 'wb') as f:
f.write(file)
# check if file is .zip and extract it
if filename.endswith('.zip'):
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
zip_ref.extractall(full_path)
os.remove(full_path + '/' + filename)
self.update_state(state='PROGRESS', meta={'current': 1})
raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
file_data = {'name': name_job, 'user': user}
if settings.VECTOR_STORE == "faiss":
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
'file_pkl': open(full_path + '/index.pkl', 'rb')}
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
else:
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
# delete local
shutil.rmtree(full_path)
return {
'directory': directory,
'formats': formats,
'name_job': name_job,
'filename': filename,
'user': user,
'limited': False
}

View File

@@ -1,4 +1,4 @@
from app import app
from application.app import app
if __name__ == "__main__":
app.run()
app.run(debug=True, port=7091)

2
codecov.yml Normal file
View File

@@ -0,0 +1,2 @@
ignore:
- "*/tests/*”

71
docker-compose-azure.yaml Normal file
View File

@@ -0,0 +1,71 @@
version: "3.9"
services:
frontend:
build: ./frontend
environment:
- VITE_API_HOST=http://localhost:7091
- VITE_API_STREAMING=$VITE_API_STREAMING
ports:
- "5173:5173"
depends_on:
- backend
backend:
build: ./application
environment:
- API_KEY=$OPENAI_API_KEY
- EMBEDDINGS_KEY=$OPENAI_API_KEY
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/1
- MONGO_URI=mongodb://mongo:27017/docsgpt
- OPENAI_API_KEY=$OPENAI_API_KEY
- OPENAI_API_BASE=$OPENAI_API_BASE
- OPENAI_API_VERSION=$OPENAI_API_VERSION
- AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
- AZURE_EMBEDDINGS_DEPLOYMENT_NAME=$AZURE_EMBEDDINGS_DEPLOYMENT_NAME
ports:
- "7091:7091"
volumes:
- ./application/indexes:/app/application/indexes
- ./application/inputs:/app/application/inputs
- ./application/vectors:/app/application/vectors
depends_on:
- redis
- mongo
worker:
build: ./application
command: celery -A application.app.celery worker -l INFO
environment:
- API_KEY=$OPENAI_API_KEY
- EMBEDDINGS_KEY=$OPENAI_API_KEY
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/1
- MONGO_URI=mongodb://mongo:27017/docsgpt
- API_URL=http://backend:7091
- OPENAI_API_KEY=$OPENAI_API_KEY
- OPENAI_API_BASE=$OPENAI_API_BASE
- OPENAI_API_VERSION=$OPENAI_API_VERSION
- AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
- AZURE_EMBEDDINGS_DEPLOYMENT_NAME=$AZURE_EMBEDDINGS_DEPLOYMENT_NAME
depends_on:
- redis
- mongo
redis:
image: redis:6-alpine
ports:
- 6379:6379
mongo:
image: mongo:6
ports:
- 27017:27017
volumes:
- mongodb_data_container:/data/db
volumes:
mongodb_data_container:

20
docker-compose-dev.yaml Normal file
View File

@@ -0,0 +1,20 @@
version: "3.9"
services:
redis:
image: redis:6-alpine
ports:
- 6379:6379
mongo:
image: mongo:6
ports:
- 27017:27017
volumes:
- mongodb_data_container:/data/db
volumes:
mongodb_data_container:

26
docker-compose-local.yaml Normal file
View File

@@ -0,0 +1,26 @@
version: "3.9"
services:
frontend:
build: ./frontend
environment:
- VITE_API_HOST=http://localhost:7091
- VITE_API_STREAMING=$VITE_API_STREAMING
- VITE_EMBEDDINGS_NAME=$EMBEDDINGS_NAME
ports:
- "5173:5173"
redis:
image: redis:6-alpine
ports:
- 6379:6379
mongo:
image: mongo:6
ports:
- 27017:27017
volumes:
- mongodb_data_container:/data/db
volumes:
mongodb_data_container:

View File

@@ -4,12 +4,57 @@ services:
   frontend:
     build: ./frontend
     environment:
-      - API_HOST=http://backend:5001
+      - VITE_API_HOST=http://localhost:7091
+      - VITE_API_STREAMING=$VITE_API_STREAMING
     ports:
       - "5173:5173"
     depends_on:
       - backend
   backend:
     build: ./application
     environment:
+      - API_KEY=$OPENAI_API_KEY
+      - EMBEDDINGS_KEY=$OPENAI_API_KEY
+      - CELERY_BROKER_URL=redis://redis:6379/0
+      - CELERY_RESULT_BACKEND=redis://redis:6379/1
+      - MONGO_URI=mongodb://mongo:27017/docsgpt
+      - SELF_HOSTED_MODEL=$SELF_HOSTED_MODEL
     ports:
-      - "5001:5001"
+      - "7091:7091"
+    volumes:
+      - ./application/indexes:/app/application/indexes
+      - ./application/inputs:/app/application/inputs
+      - ./application/vectors:/app/application/vectors
+    depends_on:
+      - redis
+      - mongo
+  worker:
+    build: ./application
+    command: celery -A application.app.celery worker -l INFO
+    environment:
+      - API_KEY=$OPENAI_API_KEY
+      - EMBEDDINGS_KEY=$OPENAI_API_KEY
+      - CELERY_BROKER_URL=redis://redis:6379/0
+      - CELERY_RESULT_BACKEND=redis://redis:6379/1
+      - MONGO_URI=mongodb://mongo:27017/docsgpt
+      - API_URL=http://backend:7091
+    depends_on:
+      - redis
+      - mongo
+  redis:
+    image: redis:6-alpine
+    ports:
+      - 6379:6379
+  mongo:
+    image: mongo:6
+    ports:
+      - 27017:27017
+    volumes:
+      - mongodb_data_container:/data/db
+volumes:
+  mongodb_data_container:

1
docs/README.md Normal file
View File

@@ -0,0 +1 @@
# nextra-docsgpt

9
docs/next.config.js Normal file
View File

@@ -0,0 +1,9 @@
const withNextra = require('nextra')({
  theme: 'nextra-theme-docs',
  themeConfig: './theme.config.jsx'
})

module.exports = withNextra()

// If you have other Next.js configurations, you can pass them as the parameter:
// module.exports = withNextra({ /* other next.js config */ })

5975
docs/package-lock.json generated Normal file

File diff suppressed because it is too large

11
docs/package.json Normal file
View File

@@ -0,0 +1,11 @@
{
  "dependencies": {
    "@vercel/analytics": "^1.0.2",
    "docsgpt": "^0.2.4",
    "next": "^13.4.19",
    "nextra": "^2.12.3",
    "nextra-theme-docs": "^2.12.3",
    "react": "^18.2.0",
    "react-dom": "^18.2.0"
  }
}

View File

@@ -0,0 +1,112 @@
# Self-hosting DocsGPT on Amazon Lightsail
Here's a step-by-step guide on how to set up an Amazon Lightsail instance to host DocsGPT.
## Configuring your instance
(If you already know how to create a Lightsail instance, you can skip ahead to the recommended configuration below.)
### 1. Create an account or log in to https://lightsail.aws.amazon.com
### 2. Click on "Create instance"
### 3. Create your instance
The first step is to select the "Instance location". In most cases there's no need to switch locations as the default one will work well.
After that it is time to pick your Instance Image. We recommend using "Linux/Unix" as the image and "Ubuntu 20.04 LTS" as the operating system.
As for the instance plan, it'll vary depending on your unique demands, but a "1 GB RAM, 1 vCPU, 40 GB SSD and 2 TB transfer" setup should cover most scenarios.
Lastly, identify your instance by giving it a unique name, then hit "Create instance".
PS: Once you create your instance, it'll likely take a few minutes for the setup to complete.
#### The recommended configuration is as follows:
- Ubuntu 20.04 LTS
- 1GB RAM
- 1vCPU
- 40GB SSD Hard Drive
- 2TB transfer
### Connecting to your newly created instance
Your instance will be ready for use a few minutes after being created. To access it, just open it up and click on "Connect using SSH".
#### Clone the repository
A terminal window will pop up, and the first step is to clone the DocsGPT git repository:
`git clone https://github.com/arc53/DocsGPT.git`
#### Download the package information
Once it has finished cloning the repository, download the package information from all sources by entering the following command:
`sudo apt update`
#### Install Docker and Docker Compose
The DocsGPT backend and worker use Python, the frontend is written in React, and the whole application is containerized using Docker. To install Docker and Docker Compose, enter the following commands:
`sudo apt install docker.io`
And now install docker-compose:
`sudo apt install docker-compose`
#### Access the DocsGPT folder
Enter the following command to access the folder containing the DocsGPT docker-compose file:
`cd DocsGPT/`
#### Prepare the environment
Inside the DocsGPT folder create a .env file and copy the contents of .env_sample into it.
`nano .env`
Make sure your .env file looks like this:
```
OPENAI_API_KEY=(Your OpenAI API key)
VITE_API_STREAMING=true
SELF_HOSTED_MODEL=false
```
To save the file, press CTRL+X, then Y, and then ENTER.
Next we need to set the correct IP for the backend. To do so, open the docker-compose.yml file:
`nano docker-compose.yml`
Change line 7 from `VITE_API_HOST=http://localhost:7091`
to `VITE_API_HOST=http://<your instance public IP>:7091`
This will allow the frontend to connect to the backend.
#### Running the app
You're almost there! Now that all the necessary bits and pieces have been installed, it is time to run the application. To do so, use the following command:
`sudo docker-compose up -d`
If you launch it for the first time it will take a few minutes to download all the necessary dependencies and build.
Once this is done you can go ahead and close the terminal window.
#### Enabling ports
Before you can access your live instance, you must first open the ports it uses.
Open your Lightsail instance and head to "Networking".
Then click on "Add rule" under "IPv4 Firewall", enter 5173 as the port and hit "Create".
Repeat the process for port 7091.
#### Access your instance
Your instance will now be available under your Public IP Address and port 5173. Enjoy!

View File

@@ -0,0 +1,23 @@
## Launching Web App
Note: Make sure you have Docker installed
1. Download this repository with `git clone https://github.com/arc53/DocsGPT.git`
2. Create a .env file in your root directory and set `OPENAI_API_KEY` to your OpenAI API key
3. Run `docker-compose build && docker-compose up`
4. Navigate to `http://localhost:5173/`
To stop, just press Ctrl + C.
### Chrome Extension
To install the Chrome extension:
1. In the DocsGPT GitHub repository, click on the "Code" button and select Download ZIP
2. Unzip the downloaded file to a location you can easily access
3. Open the Google Chrome browser and click on the three dots menu (upper right corner)
4. Select "More Tools" and then "Extensions"
5. Turn on the "Developer mode" switch in the top right corner of the Extensions page
6. Click on the "Load unpacked" button
7. Select the "Chrome" folder where the DocsGPT files have been unzipped (docsgpt-main > extensions > chrome)
8. The extension should now be added to Google Chrome and can be managed on the Extensions page
9. To disable or remove the extension, simply turn off the toggle switch on the extension card or click the "Remove" button.

View File

@@ -0,0 +1,10 @@
{
  "Hosting-the-app": {
    "title": "☁️ Hosting DocsGPT",
    "href": "/Deploying/Hosting-the-app"
  },
  "Quickstart": {
    "title": "⚡Quickstart",
    "href": "/Deploying/Quickstart"
  }
}

View File

@@ -0,0 +1,153 @@
The app currently has the following API endpoints:
### /api/answer
It's a POST request that sends JSON in the body with 4 values, and it returns an answer to a user-provided question. Here is a JavaScript fetch example:
```js
// answer (POST http://127.0.0.1:5000/api/answer)
fetch("http://127.0.0.1:5000/api/answer", {
  "method": "POST",
  "headers": {
    "Content-Type": "application/json; charset=utf-8"
  },
  "body": JSON.stringify({
    "question": "Hi",
    "history": null,
    "api_key": "OPENAI_API_KEY",
    "embeddings_key": "OPENAI_API_KEY",
    "active_docs": "javascript/.project/ES2015/openai_text-embedding-ada-002/"
  })
})
  .then((res) => res.text())
  .then(console.log.bind(console))
```
In response you will get a JSON document like this one:
```json
{
  "answer": " Hi there! How can I help you?\n",
  "query": "Hi",
  "result": " Hi there! How can I help you?\nSOURCES:"
}
```
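If you are calling the API from Python instead, here is a rough equivalent of the fetch call above using the `requests` library (a minimal sketch; the host and body values simply mirror the JS example):
```python
import requests

# same request as the fetch example above
response = requests.post(
    "http://127.0.0.1:5000/api/answer",
    json={
        "question": "Hi",
        "history": None,  # serialized as null
        "api_key": "OPENAI_API_KEY",
        "embeddings_key": "OPENAI_API_KEY",
        "active_docs": "javascript/.project/ES2015/openai_text-embedding-ada-002/",
    },
)
print(response.json()["answer"])
```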
### /api/docs_check
It checks that the documentation is loaded on the server (run it every time the user switches between libraries/documentations).
It's a POST request that sends JSON in the body with 1 value. Here is a JavaScript fetch example:
```js
// docs_check (POST http://127.0.0.1:5000/api/docs_check)
fetch("http://127.0.0.1:5000/api/docs_check", {
  "method": "POST",
  "headers": {
    "Content-Type": "application/json; charset=utf-8"
  },
  "body": JSON.stringify({
    "docs": "javascript/.project/ES2015/openai_text-embedding-ada-002/"
  })
})
  .then((res) => res.text())
  .then(console.log.bind(console))
```
In response you will get a JSON document like this one:
```json
{
  "status": "exists"
}
```
### /api/combine
A simple GET request that returns JSON telling the UI which vectorstores are available and where they are located.
The response will include:
date, description, docLink, fullName, language, location (local or docshub), model, name, version
Example of json in Docshub and local:
<img width="295" alt="image" src="https://user-images.githubusercontent.com/15183589/224714085-f09f51a4-7a9a-4efb-bd39-798029bb4273.png">
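For illustration, a single entry in the response might look like this (all values below are invented placeholders):
```json
{
  "date": "2023-09-27",
  "description": "Example docs",
  "docLink": "https://example.com/docs",
  "fullName": "Example documentation",
  "language": "javascript",
  "location": "local",
  "model": "openai_text-embedding-ada-002",
  "name": "example",
  "version": "1.0"
}
```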
### /api/upload
Uploads a file that needs to be trained on. The response is JSON with a task id, which can be used to check the task's progress.
HTML example:
```html
<form action="/api/upload" method="post" enctype="multipart/form-data" class="mt-2">
  <input type="file" name="file" class="py-4" id="file-upload">
  <input type="text" name="user" value="local" hidden>
  <input type="text" name="name" placeholder="Name:">
  <button type="submit" class="py-2 px-4 text-white bg-blue-500 rounded-md hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500">
    Upload
  </button>
</form>
```
Response:
```json
{
  "status": "ok",
  "task_id": "b2684988-9047-428b-bd47-08518679103c"
}
```
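If you are uploading from a script rather than the form, a minimal Python sketch might look like this (the file path is a placeholder; the host and form fields mirror the examples above and below):
```python
import requests

# upload a local file for ingestion; the fields match the HTML form above
with open("inputs/install.rst", "rb") as f:
    response = requests.post(
        "http://localhost:5001/api/upload",
        files={"file": f},
        data={"user": "local", "name": "somename"},
    )

task_id = response.json()["task_id"]  # pass this to /api/task_status
```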
### /api/task_status
Gets the status of a task by its task_id (returned from /api/upload).
```js
// Task status (GET http://127.0.0.1:5000/api/task_status)
fetch("http://localhost:5001/api/task_status?task_id=b2d2a0f4-387c-44fd-a443-e4fe2e7454d1", {
  "method": "GET",
  "headers": {
    "Content-Type": "application/json; charset=utf-8"
  },
})
  .then((res) => res.text())
  .then(console.log.bind(console))
```
There are two types of responses:
1. While the task is still running, where "current" will show the progress from 0 to 100
```json
{
  "result": {
    "current": 1
  },
  "status": "PROGRESS"
}
```
2. When the task is completed
```json
{
  "result": {
    "directory": "temp",
    "filename": "install.rst",
    "formats": [
      ".rst",
      ".md",
      ".pdf"
    ],
    "name_job": "somename",
    "user": "local"
  },
  "status": "SUCCESS"
}
```
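Since ingestion can take a while, a client typically polls this endpoint until the task finishes. Here is a minimal Python sketch (it only handles the two documented states; a real client should also handle failures and timeouts):
```python
import time

import requests

def wait_for_task(task_id, base_url="http://localhost:5001"):
    """Poll /api/task_status until the task reports SUCCESS."""
    while True:
        status = requests.get(
            f"{base_url}/api/task_status", params={"task_id": task_id}
        ).json()
        if status["status"] == "SUCCESS":
            return status["result"]
        # while running, status["result"]["current"] goes from 0 to 100
        time.sleep(2)
```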
### /api/delete_old
Deletes old vectorstores. It takes a `path` query parameter pointing at the vectorstore to delete:
```js
// delete_old (GET http://127.0.0.1:5000/api/delete_old)
fetch("http://localhost:5001/api/delete_old?path=local/docs1.zip", {
  "method": "GET",
  "headers": {
    "Content-Type": "application/json; charset=utf-8"
  },
})
  .then((res) => res.text())
  .then(console.log.bind(console))
```
Response:
```json
{ "status": "ok" }
```

View File

@@ -0,0 +1,6 @@
{
  "API-docs": {
    "title": "🗂️️ API-docs",
    "href": "/Developing/API-docs"
  }
}

View File

@@ -0,0 +1,29 @@
### To start the Chatwoot extension:
1. Prepare and start DocsGPT itself (load your documentation too).
Follow our [wiki](https://github.com/arc53/DocsGPT/wiki) to start it and to [ingest](https://github.com/arc53/DocsGPT/wiki/How-to-train-on-other-documentation) data.
2. Go to Chatwoot, navigate to your profile (bottom left), click on profile settings, scroll to the bottom and copy your Access Token.
3. Navigate to `/extensions/chatwoot`. Copy .env_sample and create a .env file.
4. Fill in the values:
```
docsgpt_url=<docsgpt_api_url>
chatwoot_url=<chatwoot_url>
docsgpt_key=<openai_api_key or other llm key>
chatwoot_token=<from step 2>
```
5. Start it with the `flask run` command.
If you want the bot to stop responding to questions for a specific user or session, just add the label `human-requested` to your conversation.
### Optional (extra validation)
In app.py, uncomment lines 12-13 and 71-75.
In your .env file add:
`account_id=(optional) 1`
`assignee_id=(optional) 1`
These are Chatwoot values that let you check whether you are responding to the correct widget and to questions assigned to a specific user.

View File

@@ -0,0 +1,10 @@
{
  "Chatwoot-extension": {
    "title": "💬️ Chatwoot Extension",
    "href": "/Extensions/Chatwoot-extension"
  },
  "react-widget": {
    "title": "🏗️ Widget setup",
    "href": "/Extensions/react-widget"
  }
}

View File

@@ -0,0 +1,37 @@
### How to set up the React DocsGPT widget on your website
### Installation
Go to your project and install a new dependency: `npm install docsgpt`
### Usage
Go to your project and in the file where you want to use the widget import it:
```js
import { DocsGPTWidget } from "docsgpt";
import "docsgpt/dist/style.css";
```
Then you can use it like this: `<DocsGPTWidget />`
DocsGPTWidget takes 3 props:
- `apiHost` - the URL of your DocsGPT API
- `selectDocs` - the documentation that you want to use for your widget (e.g. `default` or `local/docs1.zip`)
- `apiKey` - usually it's empty
### How to use DocsGPTWidget with [Nextra](https://nextra.site/) (Next.js + MDX)
Install your widget as described above, then go to your `pages/` folder and create a new file `_app.js` with the following content:
```js
import { DocsGPTWidget } from "docsgpt";
import "docsgpt/dist/style.css";

export default function MyApp({ Component, pageProps }) {
  return (
    <>
      <Component {...pageProps} />
      <DocsGPTWidget selectDocs="local/docsgpt-sep.zip/"/>
    </>
  )
}
```

View File

@@ -0,0 +1,4 @@
## To customise the main prompt, navigate to `/application/prompt/combine_prompt.txt`
You can try editing it to see how the model responds.

View File

@@ -0,0 +1,60 @@
## How to train on other documentation
This AI can use any documentation, but first it needs to be prepared for similarity search.
![video-example-of-how-to-do-it](https://d3dg1063dc54p9.cloudfront.net/videos/how-to-vectorise.gif)
Start by going to the `/scripts/` folder.
If you open `ingest.py` you will see that it uses RST files from the folder to create `index.faiss` and `index.pkl`.
It currently uses OpenAI to create the vector store, so make sure your documentation is not too big. Ingesting the Pandas documentation cost me around $3-4.
You can usually find documentation on GitHub in the docs/ folder of most open-source projects.
### 1. Find documentation in .rst/.md and create a folder with it in your scripts directory
Name it `inputs/`
Put all your .rst/.md files in there
The search is recursive, so you don't need to flatten the folder structure
If there are no .rst/.md files, just convert whatever you find to .txt and feed it in (don't forget to change the extension in the script)
### 2. Create .env file in `scripts/` folder
And write your OpenAI API key inside
`OPENAI_API_KEY=<your-api-key>`
### 3. Run scripts/ingest.py
`python ingest.py ingest`
It will tell you how much it will cost
### 4. Move `index.faiss` and `index.pkl` generated in `scripts/output` to `application/` folder.
### 5. Run web app
Once you run the web app, it will use the new context that is relevant to your documentation
Make sure you select "default" in the dropdown in the UI
## Customisation
You can learn more about options while running ingest.py by running:
`python ingest.py --help`
| Options | |
|:--------------------------------:|:------------------------------------------------------------------------------------------------------------------------------:|
| **ingest** | Runs the 'ingest' function, converting documentation to Faiss plus Index format |
| --dir TEXT | List of paths to directory for index creation. E.g. --dir inputs --dir inputs2 [default: inputs] |
| --file TEXT | File paths to use (Optional; overrides directory) E.g. --files inputs/1.md --files inputs/2.md |
| --recursive / --no-recursive | Whether to recursively search in subdirectories [default: recursive] |
| --limit INTEGER | Maximum number of files to read |
| --formats TEXT | List of required extensions (list with .) Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html [default: .rst, .md] |
| --exclude / --no-exclude | Whether to exclude hidden files (dotfiles) [default: exclude] |
| -y, --yes | Whether to skip price confirmation |
| --sample / --no-sample | Whether to output sample of the first 5 split documents. [default: no-sample] |
| --token-check / --no-token-check | Whether to group small documents and split large. Improves semantics. [default: token-check] |
| --min_tokens INTEGER | Minimum number of tokens to not group. [default: 150] |
| --max_tokens INTEGER | Maximum number of tokens to not split. [default: 2000] |
| | |
| **convert** | Creates documentation in .md format from source code |
| --dir TEXT | Path to a directory with source code. E.g. --dir inputs [default: inputs] |
| --formats TEXT | Source code language from which to create documentation. Supports py, js and java. E.g. --formats py [default: py] |

View File

@@ -0,0 +1,32 @@
Fortunately there are many providers for LLMs, and some of them can even be run locally.
There are two models used in the app:
1. Embeddings
2. Text generation
By default we use OpenAI's models, but if you want to change this or even run the models locally, it's very simple!
### Go to .env file or set environment variables:
`LLM_NAME=<your Text generation>`
`API_KEY=<api_key for Text generation>`
`EMBEDDINGS_NAME=<llm for embeddings>`
`EMBEDDINGS_KEY=<api_key for embeddings>`
`VITE_API_STREAMING=<true or false (true if using openai, false for all others)>`
You don't need to provide keys if you are happy with users providing theirs, but make sure you set LLM_NAME and EMBEDDINGS_NAME.
Options:
LLM_NAME (openai, manifest, cohere, Arc53/docsgpt-14b, Arc53/docsgpt-7b-falcon)
EMBEDDINGS_NAME (openai_text-embedding-ada-002, huggingface_sentence-transformers/all-mpnet-base-v2, huggingface_hkunlp/instructor-large, cohere_medium)
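For example, a typical OpenAI-backed .env might look like this (the key values are placeholders):
```
LLM_NAME=openai
API_KEY=sk-xxxx
EMBEDDINGS_NAME=openai_text-embedding-ada-002
EMBEDDINGS_KEY=sk-xxxx
VITE_API_STREAMING=true
```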
That's it!
### Hosting everything locally and privately (for using our optimised open-source models)
If you are working with important data and don't want anything to leave your premises, make sure you set SELF_HOSTED_MODEL to true in your .env file, and for LLM_NAME you can use anything that's on Hugging Face.
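For example, a sketch using one of the optimised models listed above:
```
SELF_HOSTED_MODEL=true
LLM_NAME=Arc53/docsgpt-7b-falcon
VITE_API_STREAMING=false
```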

Some files were not shown because too many files have changed in this diff