Compare commits

...

108 Commits
0.2.0 ... 0.3.0

Author SHA1 Message Date
Alex
577d58c92b less token less issues 2023-06-03 16:31:10 +01:00
Alex
899777632b Update README.md 2023-06-03 16:09:10 +01:00
Alex
bbf55ca46e Merge pull request #250 from tardigrde/main 2023-06-01 14:56:48 +01:00
Alex
3f88b04c4a Update app.py 2023-05-31 23:49:41 +01:00
Alex
f8910ba136 Added history in streaming convo + fixed little bug with message margins on loading state 2023-05-31 23:47:16 +01:00
Alex
6c95d8b13e Merge pull request #251 from arc53/feature/streaming
Feature/streaming
2023-05-31 22:30:57 +01:00
Alex
e6bccaaf4e Update app.py 2023-05-31 22:20:47 +01:00
Alex
3b8039a580 Merge branch 'main' into feature/streaming 2023-05-31 22:15:53 +01:00
Alex
fae3f55010 Working streaming 2023-05-31 17:44:20 +01:00
Alex
20c877f75b working fe 2023-05-31 15:42:17 +01:00
Alex
8380858a82 some fixes 2023-05-30 20:00:41 +01:00
Alex
d2358c399d working version 2023-05-30 19:43:06 +01:00
Alex
c3af8a77af working streams 2023-05-29 17:55:43 +01:00
Levente Csőke
bc5a0b030b Update .env-template to OPENAI_API_KEY 2023-05-26 08:57:11 +02:00
Alex
0b94f1717f Merge pull request #246 from arc53/feature/gpt4all
Feature/gpt4all
2023-05-25 19:42:20 +01:00
Alex
aaa1249a41 model fix + env var 2023-05-25 19:33:37 +01:00
Alex
ffaa22c49b reverse history order to use latest history firts
Co-Authored-By: Pavel <32868631+pabik@users.noreply.github.com>
2023-05-25 16:40:11 +01:00
Alex
0b78480977 init 2023-05-25 15:14:47 +01:00
Alex
6b6737613a Merge pull request #243 from nazihkalo/main
updating the bulk ingest file metadata logic
2023-05-20 16:02:32 +01:00
Nazih Kalo
da5d62cc1c updating the bulk ingest file metadata to account for parsers that output lists 2023-05-19 10:29:18 -07:00
Alex
6a68b63192 history fix 2023-05-19 13:09:41 +01:00
Alex
ff2e79fe7b streaming experiments 2023-05-18 23:52:59 +01:00
Alex
1800e51b19 Merge pull request #241 from arc53/feature/history
Feature/history
2023-05-18 18:50:35 +01:00
Alex
ba9c505249 accidentaly deleted frontend container 2023-05-18 18:45:15 +01:00
Alex
bc9f1c17ed History
Co-Authored-By: riccardofresi <89981746+riccardofresi@users.noreply.github.com>
2023-05-18 18:42:23 +01:00
Alex
74845aed64 history init 2023-05-18 14:27:13 +01:00
Alex
e49dd0cc6a metadata on ingestion 2023-05-17 21:41:24 +01:00
Alex
27c45ae24a Merge pull request #236 from larinam/fixbuild_github_token
fix workflow: upgrade "build and push" action version to the latest
2023-05-16 12:04:27 +01:00
Anton Larin
364a14adaf fix workflow: upgrade "build and push" action version to the latest 2023-05-16 08:02:13 +02:00
Alex
5c560b1dd5 Merge pull request #235 from larinam/fixbuild_github_token
fix workflow: adjust permissions according to documentation
2023-05-15 23:17:53 +01:00
Anton Larin
28b8b88332 fix workflow: adjust permissions according to documentation
https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions#publishing-a-package-using-an-action
2023-05-15 21:22:06 +02:00
Alex
e39ef0cc9e Merge pull request #234 from larinam/fixbuild_github_token
fix workflow: login to GHCR according to the GH documentation
2023-05-15 18:17:48 +01:00
Anton Larin
8098d3fec8 fix workflow nad login to GHCR according to the GH documentation
https://docs.github.com/en/packages/managing-github-packages-using-github-actions-workflows/publishing-and-installing-a-package-with-github-actions
2023-05-15 18:55:40 +02:00
Alex
059ffe09ea Merge pull request #232 from larinam/lint
Lint
2023-05-15 13:53:09 +01:00
Alex
36a845c29e Merge pull request #231 from larinam/main
Proper PEP8 formatting
2023-05-15 13:45:52 +01:00
GH Action - Upstream Sync
ce6f0dab56 Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-05-15 12:05:18 +00:00
Alex
f200ab10a4 Merge pull request #233 from arc53/dependabot/pip/scripts/flask-2.2.5
Bump flask from 2.2.2 to 2.2.5 in /scripts
2023-05-15 12:50:30 +01:00
Alex
3001688e0e Update requirements.txt 2023-05-15 12:46:39 +01:00
GH Action - Upstream Sync
a73774099e Merge branch 'main' of https://github.com/arc53/DocsGPT 2023-05-15 11:03:45 +00:00
dependabot[bot]
b28676d52c Bump flask from 2.2.2 to 2.2.5 in /scripts
Bumps [flask](https://github.com/pallets/flask) from 2.2.2 to 2.2.5.
- [Release notes](https://github.com/pallets/flask/releases)
- [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/flask/compare/2.2.2...2.2.5)

---
updated-dependencies:
- dependency-name: flask
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-15 11:00:19 +00:00
Alex
eef012b4d1 Merge pull request #225 from arc53/dependabot/pip/application/flask-2.3.2
Bump flask from 2.2.3 to 2.3.2 in /application
2023-05-15 11:58:54 +01:00
Alex
1417a1c020 Update requirements.txt 2023-05-15 11:49:41 +01:00
Anton Larin
962becb9a5 Linting
* validate python formatting on every build with Ruff
* fix lint warnings
2023-05-13 10:36:17 +02:00
Anton Larin
168648e789 Proper PEP8 formatting 2023-05-12 12:02:25 +02:00
Alex
7f56f57778 better markdown styling 2023-05-06 15:22:23 +01:00
Alex
6cadddc2fc Merge pull request #223 from Zillibub/main
Moved env variables to the pydantic settings file
2023-05-02 11:07:52 +01:00
dependabot[bot]
15fd54eac4 Bump flask from 2.2.3 to 2.3.2 in /application
Bumps [flask](https://github.com/pallets/flask) from 2.2.3 to 2.3.2.
- [Release notes](https://github.com/pallets/flask/releases)
- [Changelog](https://github.com/pallets/flask/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/flask/compare/2.2.3...2.3.2)

---
updated-dependencies:
- dependency-name: flask
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-05-02 00:27:05 +00:00
Serj
31350e6302 Set celery and mongo urls as default 2023-04-30 11:03:09 +01:00
Serj
8742cdae0a Refactored url join 2023-04-30 10:46:52 +01:00
Serj
4efcb388ff Added settings usage to the worker 2023-04-29 15:58:02 +01:00
Serj
2d92e95c8a Added settings usage to the worker 2023-04-29 15:56:32 +01:00
Serj
47e5d5684a Replace other env variables in the file 2023-04-29 15:50:02 +01:00
Serj
b723e14d98 Added embeddings name variable 2023-04-29 15:46:09 +01:00
Serj
c9d24b8f42 Added llm model variable 2023-04-29 15:44:47 +01:00
Serj
43622e7ab1 Added settings file 2023-04-29 15:40:55 +01:00
Serge Kozloff
5cfc185ba5 Merge pull request #2 from arc53/main
d
2023-04-29 15:39:46 +01:00
Alex
4be2635fbe Merge pull request #221 from darth-pika-hu/main
Create setup.sh
2023-04-27 21:36:29 +01:00
Darth Pika
0beafb8391 Update setup.sh
This script includes the necessary changes to use container linking and updated environment variables for the `backend` and `worker` containers.

Make sure you have the `./frontend` and `./application` directories in the correct locations before running the script.
2023-04-27 12:39:03 -07:00
Darth Pika
1d2654b9fa Update setup.sh
Create required directories on the host machine if they don't exist.
2023-04-27 12:02:11 -07:00
Darth Pika
a4bc3673e7 Create setup.sh
Added a bash script to help with installation issues.
2023-04-27 11:40:25 -07:00
Alex
fa080537e8 Merge pull request #220 from Zillibub/main
Updated readme for development run
2023-04-27 12:27:20 +01:00
Serj
bdf67a7db7 Added dev docker compose file 2023-04-26 19:05:50 +01:00
Serge Kozloff
db4cdc901c Merge pull request #1 from arc53/main
t
2023-04-26 18:55:39 +01:00
Serj
16a540b89b Expand readme and added port in wsgi 2023-04-26 18:54:59 +01:00
Alex
e00ec9ac3f Update chat_combine_prompt.txt 2023-04-26 15:01:46 +01:00
Alex
fc760afdfc Update chat_combine_prompt.txt 2023-04-26 14:54:26 +01:00
Alex
cb47bcdb0e Update ConversationBubble.tsx 2023-04-26 13:35:05 +01:00
Alex
8d62559ca8 Merge pull request #219 from arc53/feature/code-highlighting
code highlighting
2023-04-26 10:30:39 +01:00
Alex
dbe9c4dc18 init 2023-04-25 17:01:44 +01:00
Serj
1609b4562d Added mongo db start 2023-04-24 19:22:42 +01:00
Serj
b6cadb1d65 Removed spaces 2023-04-24 18:46:05 +01:00
Serj
7aafac5b5e Expanded developer start a little bit 2023-04-24 18:39:53 +01:00
Pavel
36f0aacb19 Merge pull request #218 from arc53/feature/web-widget
web widget
2023-04-23 15:12:18 +01:00
Alex
0c1a6a918d web widget 2023-04-23 15:07:55 +01:00
Alex
d1f5ff4dba Merge pull request #214 from SAMZONG/main 2023-04-18 15:12:54 +01:00
samzong
77e6df2a1c add auto sync fork for workflow
Signed-off-by: samzong <samzong.lu@gmail.com>
2023-04-18 04:24:10 +00:00
Alex
119c037f24 Merge pull request #209 from arc53/dot-env
.env
2023-04-11 22:50:19 +01:00
Alex
97fe1abfd8 .env
Co-Authored-By: Subhadip N <subhadip@get-deck.com>
2023-04-11 22:49:47 +01:00
Alex
3a0163f0fb Merge pull request #202 from yuchen9/feat/ui-enhancement
feat: ui enhancement
2023-04-07 11:17:12 +01:00
Chen
d3fab69155 feat: ui enhancement 2023-04-06 23:54:16 +08:00
Alex
9395d2c091 celery load 2023-04-06 12:16:30 +01:00
Alex
b9efb98280 Update README.md 2023-04-04 14:12:35 +01:00
Alex
60bb264663 async calls 2023-04-03 14:37:09 +01:00
Alex
316dd2f165 Merge pull request #197 from arc53/dependabot/pip/application/redis-4.5.4
Bump redis from 4.5.3 to 4.5.4 in /application
2023-04-03 13:01:13 +01:00
dependabot[bot]
8a0f700563 Bump redis from 4.5.3 to 4.5.4 in /application
Bumps [redis](https://github.com/redis/redis-py) from 4.5.3 to 4.5.4.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.3...v4.5.4)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-31 14:31:03 +00:00
Alex
3d0c6eafec gpt4- compatable 2023-03-31 10:45:40 +01:00
Alex
46e055833b Merge pull request #196 from arc53/dependabot/pip/scripts/redis-4.5.4
Bump redis from 4.5.3 to 4.5.4 in /scripts
2023-03-30 12:52:15 +01:00
dependabot[bot]
80dfdd1cb9 Bump redis from 4.5.3 to 4.5.4 in /scripts
Bumps [redis](https://github.com/redis/redis-py) from 4.5.3 to 4.5.4.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.3...v4.5.4)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-30 11:49:21 +00:00
Alex
db21678b74 Merge pull request #192 from arc53/dependabot/pip/scripts/redis-4.5.3
Bump redis from 4.5.1 to 4.5.3 in /scripts
2023-03-30 12:48:58 +01:00
Alex
09c7fe0565 Merge pull request #193 from arc53/dependabot/pip/application/redis-4.5.3
Bump redis from 4.5.2 to 4.5.3 in /application
2023-03-30 12:48:35 +01:00
Alex
b6dfb2c856 map_reduce 2023-03-30 12:44:25 +01:00
Alex
ab46ba521f different prompts 2023-03-29 18:36:58 +01:00
Alex
4a7670f2aa Update app.py 2023-03-29 17:32:00 +01:00
Alex
9ba86bc174 Update preferenceSlice.ts 2023-03-28 10:19:42 +01:00
Pavel
2ebe5e051c discord bot fix
Stop random answers
2023-03-28 01:51:54 +04:00
dependabot[bot]
24e98abd15 Bump redis from 4.5.2 to 4.5.3 in /application
Bumps [redis](https://github.com/redis/redis-py) from 4.5.2 to 4.5.3.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.2...v4.5.3)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-27 21:36:39 +00:00
dependabot[bot]
b7f1a94ba4 Bump redis from 4.5.1 to 4.5.3 in /scripts
Bumps [redis](https://github.com/redis/redis-py) from 4.5.1 to 4.5.3.
- [Release notes](https://github.com/redis/redis-py/releases)
- [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES)
- [Commits](https://github.com/redis/redis-py/compare/v4.5.1...v4.5.3)

---
updated-dependencies:
- dependency-name: redis
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-27 21:34:50 +00:00
Alex
70bc7465c9 Merge pull request #191 from arc53/features/little-fixes
Features/little fixes
2023-03-27 22:28:55 +01:00
Alex
65c2568427 Update app.py 2023-03-27 22:23:36 +01:00
Alex
186e7bf402 update for better runs + storage sync 2023-03-27 22:07:26 +01:00
Alex
e6f1c7d0c3 mobile more space 2023-03-27 21:50:54 +01:00
Alex
87ad9a3190 Update Upload.tsx 2023-03-27 21:48:44 +01:00
Alex
0ed45f8754 fix pending status 2023-03-27 21:48:16 +01:00
Alex
116e4401c4 Update .env.production 2023-03-27 21:44:22 +01:00
Alex
c3c0e643d2 Update chat_combine_prompt.txt 2023-03-27 21:42:06 +01:00
Alex
d5522e7c08 prep things 2023-03-27 19:29:10 +01:00
Alex
658b14ba26 failed upload 2023-03-27 19:22:06 +01:00
Alex
38f8469d0b Update Navigation.tsx 2023-03-27 19:11:57 +01:00
73 changed files with 7737 additions and 1952 deletions

2
.env-template Normal file
View File

@@ -0,0 +1,2 @@
OPENAI_API_KEY=<LLM api key (for example, open ai key)>
EMBEDDINGS_KEY=<LLM embeddings api key (for example, open ai key)>

View File

@@ -9,6 +9,10 @@ on:
jobs:
deploy:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
@@ -23,17 +27,17 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Login to ghcr.io
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
password: ${{ secrets.GITHUB_TOKEN }}
# Runs a single command using the runners shell
- name: Build and push Docker images to docker.io and ghcr.io
uses: docker/build-push-action@v2
uses: docker/build-push-action@v4
with:
file: './application/Dockerfile'
platforms: linux/amd64

View File

@@ -9,6 +9,10 @@ on:
jobs:
deploy:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
@@ -23,17 +27,17 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Login to ghcr.io
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
password: ${{ secrets.GITHUB_TOKEN }}
# Runs a single command using the runners shell
- name: Build and push Docker images to docker.io and ghcr.io
uses: docker/build-push-action@v2
uses: docker/build-push-action@v4
with:
file: './frontend/Dockerfile'
platforms: linux/amd64

17
.github/workflows/lint.yml vendored Normal file
View File

@@ -0,0 +1,17 @@
name: Python linting
on:
push:
branches:
- '*'
pull_request:
types: [ opened, synchronize ]
jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Lint with Ruff
uses: chartboost/ruff-action@v1

41
.github/workflows/sync_fork.yaml vendored Normal file
View File

@@ -0,0 +1,41 @@
name: Upstream Sync
permissions:
contents: write
on:
schedule:
- cron: "0 * * * *" # every hour
workflow_dispatch:
jobs:
sync_latest_from_upstream:
name: Sync latest commits from upstream repo
runs-on: ubuntu-latest
if: ${{ github.event.repository.fork }}
steps:
# Step 1: run a standard checkout action
- name: Checkout target repo
uses: actions/checkout@v3
# Step 2: run the sync action
- name: Sync upstream changes
id: sync
uses: aormsby/Fork-Sync-With-Upstream-action@v3.4
with:
# set your upstream repo and branch
upstream_sync_repo: arc53/DocsGPT
upstream_sync_branch: main
target_sync_branch: main
target_repo_token: ${{ secrets.GITHUB_TOKEN }} # automatically generated, no need to set
# Set test_mode true to run tests instead of the true action!!
test_mode: false
- name: Sync check
if: failure()
run: |
echo "::error::由于权限不足,导致同步失败(这是预期的行为),请前往仓库首页手动执行[Sync fork]。"
echo "::error::Due to insufficient permissions, synchronization failed (as expected). Please go to the repository homepage and manually perform [Sync fork]."
exit 1

2
.ruff.toml Normal file
View File

@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120

View File

@@ -55,8 +55,9 @@ You can find our [Roadmap](https://github.com/orgs/arc53/projects/2) here, pleas
Note: Make sure you have docker installed
1. Download this repository with `git clone https://github.com/arc53/DocsGPT.git`
2. Open docker-compose.yaml and replace <your_api_key> with your OpenAI's key (there are 4 places)
2. Create a .env file in your root directory and set OPENAI_API_KEY to your OpenAI API key, and VITE_API_STREAMING to true (or to false if you don't want streaming answers)
3. Run `docker-compose build && docker-compose up`
4. Navigate to http://localhost:5173/
To stop just run Ctrl + C
@@ -67,19 +68,23 @@ Spin up only 2 containers from docker-compose.yaml (by deleting all services exc
Make sure you have python 3.10 or 3.11 installed
1. Navigate to `/application` folder
2. Install dependencies
2. Run `docker-compose -f docker-compose-dev.yaml build && docker-compose -f docker-compose-dev.yaml up -d`
3. Export required variables
`export CELERY_BROKER_URL=redis://localhost:6379/0`
`export CELERY_RESULT_BACKEND=redis://localhost:6379/1`
`export MONGO_URI=mongodb://localhost:27017/docsgpt`
4. Install dependencies
`pip install -r requirements.txt`
3. Prepare .env file
5. Prepare .env file
Copy .env_sample and create .env with your openai api token
4. Run the app
`python app.py`
5. Start worker with `celery -A app.celery worker -l INFO`
6. Run the app
`python wsgi.py`
7. Start worker with `celery -A app.celery worker -l INFO`
To start frontend
1. Navigate to `/frontend` folder
2. Install dependencies
`npm install`
3. In the file `.env.development` instead of `VITE_API_HOST = https://docsapi.arc53.com` use `VITE_API_HOST=http://localhost:5001`
3. Run the app
4. `npm run dev`

View File

@@ -4,7 +4,7 @@ FROM python:3.10-slim-bullseye as builder
RUN apt-get update && apt-get install -y gcc curl
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --upgrade pip && pip install tiktoken==0.1.2
RUN pip install --upgrade pip && pip install tiktoken==0.3.3
COPY requirements.txt .
RUN pip install -r requirements.txt

View File

@@ -1,16 +1,20 @@
import asyncio
import datetime
import http.client
import json
import os
import traceback
import openai
import dotenv
import requests
from celery import Celery
from celery.result import AsyncResult
from flask import Flask, request, render_template, send_from_directory, jsonify
from flask import Flask, request, render_template, send_from_directory, jsonify, Response
from langchain import FAISS
from langchain import VectorDBQA, HuggingFaceHub, Cohere, OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings, CohereEmbeddings, \
@@ -20,26 +24,19 @@ from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
AIMessagePromptTemplate,
)
from pymongo import MongoClient
from werkzeug.utils import secure_filename
from langchain.llms import GPT4All
from core.settings import settings
from error import bad_request
from worker import ingest_worker
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
if os.getenv("LLM_NAME") is not None:
llm_choice = os.getenv("LLM_NAME")
else:
llm_choice = "openai_chat"
if os.getenv("EMBEDDINGS_NAME") is not None:
embeddings_choice = os.getenv("EMBEDDINGS_NAME")
else:
embeddings_choice = "openai_text-embedding-ada-002"
if llm_choice == "manifest":
if settings.LLM_NAME == "manifest":
from manifest import Manifest
from langchain.llms.manifest import ManifestWrapper
@@ -76,27 +73,71 @@ with open("prompts/chat_combine_prompt.txt", "r") as f:
with open("prompts/chat_reduce_prompt.txt", "r") as f:
chat_reduce_template = f.read()
if os.getenv("API_KEY") is not None:
if settings.API_KEY is not None:
api_key_set = True
else:
api_key_set = False
if os.getenv("EMBEDDINGS_KEY") is not None:
if settings.EMBEDDINGS_KEY is not None:
embeddings_key_set = True
else:
embeddings_key_set = False
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER = "inputs"
app.config['CELERY_BROKER_URL'] = os.getenv("CELERY_BROKER_URL")
app.config['CELERY_RESULT_BACKEND'] = os.getenv("CELERY_RESULT_BACKEND")
app.config['MONGO_URI'] = os.getenv("MONGO_URI")
celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'], backend=app.config['CELERY_RESULT_BACKEND'])
celery.conf.update(app.config)
app.config['CELERY_BROKER_URL'] = settings.CELERY_BROKER_URL
app.config['CELERY_RESULT_BACKEND'] = settings.CELERY_RESULT_BACKEND
app.config['MONGO_URI'] = settings.MONGO_URI
celery = Celery()
celery.config_from_object('celeryconfig')
mongo = MongoClient(app.config['MONGO_URI'])
db = mongo["docsgpt"]
vectors_collection = db["vectors"]
async def async_generate(chain, question, chat_history):
    """Await ``chain.arun`` with the question and chat history and return its result."""
    return await chain.arun({"question": question, "chat_history": chat_history})


def run_async_chain(chain, question, chat_history):
    """Run ``chain`` to completion from synchronous code.

    Args:
        chain: object exposing an awaitable ``arun(inputs)`` method.
        question: the user question, passed to the chain under ``"question"``.
        chat_history: previous turns, passed to the chain under ``"chat_history"``.

    Returns:
        dict: ``{"answer": <value returned by chain.arun>}``.

    Raises:
        Whatever ``chain.arun`` raises; the temporary event loop is still
        closed in that case.
    """
    # asyncio.run creates a fresh event loop, closes it afterwards and clears
    # the thread's current-loop reference, so no closed loop is left installed
    # (which the previous new_event_loop/set_event_loop/close sequence did).
    answer = asyncio.run(async_generate(chain, question, chat_history))
    return {"answer": answer}
def get_vectorstore(data):
    """Map the ``active_docs`` entry of ``data`` to a vector store path.

    Args:
        data: mapping that may contain an ``"active_docs"`` string.

    Returns:
        str: ``""`` when ``active_docs`` is missing, equals ``"default"`` or
        names the local default (``"local/default..."``);
        ``"indexes/<active_docs>"`` for other ``local/...`` values;
        ``"vectors/<active_docs>"`` for everything else.
    """
    if "active_docs" not in data:
        return ""

    active_docs = data["active_docs"]
    if active_docs == "default":
        return ""

    # NOTE(review): active_docs can come straight from the request and is
    # concatenated into a path; consider rejecting ".." segments upstream.
    parts = active_docs.split("/")
    if parts[0] != "local":
        return "vectors/" + active_docs

    # Guard the second segment: a bare "local" has no "/" and previously
    # raised IndexError here.
    if len(parts) > 1 and parts[1] == "default":
        return ""
    return "indexes/" + active_docs
def get_docsearch(vectorstore, embeddings_key):
    """Load the FAISS index at ``vectorstore`` with the configured embeddings.

    The embeddings implementation is chosen by ``settings.EMBEDDINGS_NAME``.

    Args:
        vectorstore: path passed to ``FAISS.load_local``.
        embeddings_key: API key handed to the OpenAI or Cohere embeddings;
            not used by the Hugging Face variants.

    Returns:
        The object returned by ``FAISS.load_local``.

    Raises:
        ValueError: if ``settings.EMBEDDINGS_NAME`` is not one of the
            supported names (previously this surfaced as UnboundLocalError).
    """
    embeddings_name = settings.EMBEDDINGS_NAME
    if embeddings_name == "openai_text-embedding-ada-002":
        embeddings = OpenAIEmbeddings(openai_api_key=embeddings_key)
    elif embeddings_name == "huggingface_sentence-transformers/all-mpnet-base-v2":
        embeddings = HuggingFaceHubEmbeddings()
    elif embeddings_name == "huggingface_hkunlp/instructor-large":
        embeddings = HuggingFaceInstructEmbeddings()
    elif embeddings_name == "cohere_medium":
        embeddings = CohereEmbeddings(cohere_api_key=embeddings_key)
    else:
        raise ValueError("unknown embeddings model")
    return FAISS.load_local(vectorstore, embeddings)
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)
@@ -105,8 +146,68 @@ def ingest(self, directory, formats, name_job, filename, user):
@app.route("/")
def home():
return render_template("index.html", api_key_set=api_key_set, llm_choice=llm_choice,
embeddings_choice=embeddings_choice)
return render_template("index.html", api_key_set=api_key_set, llm_choice=settings.LLM_NAME,
embeddings_choice=settings.EMBEDDINGS_NAME)
def complete_stream(question, docsearch, chat_history, api_key):
    """Stream a chat completion for ``question`` as server-sent-event strings.

    The two most similar documents from ``docsearch`` are substituted into the
    chat-combine prompt as the system message. Completed turns from
    ``chat_history`` are added while they fit in
    ``settings.TOKENS_MAX_HISTORY``, newest turns being considered first.

    Args:
        question: the user question.
        docsearch: object exposing ``similarity_search(query, k=...)``.
        chat_history: list of dicts; only entries holding both ``"prompt"``
            and ``"response"`` are used. The list is not modified.
        api_key: OpenAI API key.

    Yields:
        str: ``"data: <json>\\n\\n"`` chunks, one ``{"answer": ...}`` per content
        delta, followed by a final ``{"type": "end"}`` marker.
    """
    openai.api_key = api_key
    llm = ChatOpenAI(openai_api_key=api_key)
    docs = docsearch.similarity_search(question, k=2)
    # join all page_content together with a newline
    docs_together = "\n".join([doc.page_content for doc in docs])
    p_chat_combine = chat_combine_template.replace("{summaries}", docs_together)
    messages_combine = [{"role": "system", "content": p_chat_combine}]

    if len(chat_history) > 1:
        tokens_current_history = 0
        selected_turns = []
        # Walk newest-to-oldest so the token budget favours recent turns.
        # reversed() leaves the caller's list untouched.
        for turn in reversed(chat_history):
            if "prompt" in turn and "response" in turn:
                tokens_batch = llm.get_num_tokens(turn["prompt"]) + llm.get_num_tokens(turn["response"])
                if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
                    tokens_current_history += tokens_batch
                    selected_turns.append(turn)
        # Send the kept turns in chronological order, and mark earlier model
        # replies as "assistant" rather than "system".
        for turn in reversed(selected_turns):
            messages_combine.append({"role": "user", "content": turn["prompt"]})
            messages_combine.append({"role": "assistant", "content": turn["response"]})

    messages_combine.append({"role": "user", "content": question})
    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo",
                                              messages=messages_combine, stream=True, max_tokens=500,
                                              temperature=0)

    for line in completion:
        delta = line['choices'][0]['delta']
        # only deltas that carry content are forwarded
        if 'content' in delta:
            data = json.dumps({"answer": str(delta['content'])})
            yield f"data: {data}\n\n"
    # send data.type = "end" to indicate that the stream has ended as json
    data = json.dumps({"type": "end"})
    yield f"data: {data}\n\n"
@app.route("/stream", methods=['POST', 'GET'])
def stream():
    """Answer a question as a ``text/event-stream`` response.

    Reads ``question``, ``history`` (a JSON-encoded list), ``active_docs`` and,
    when they are not configured in settings, ``api_key`` / ``embeddings_key``
    from the URL query string.
    """
    question = request.args.get('question')
    # A missing or empty "history" parameter means "no previous conversation";
    # json.loads(None) would otherwise raise TypeError.
    history = json.loads(request.args.get('history') or "[]")

    # NOTE(review): keys passed in the query string can end up in access logs.
    if not api_key_set:
        api_key = request.args.get("api_key")
    else:
        api_key = settings.API_KEY
    if not embeddings_key_set:
        embeddings_key = request.args.get("embeddings_key")
    else:
        embeddings_key = settings.EMBEDDINGS_KEY

    # check if active_docs is set
    if "active_docs" in request.args:
        vectorstore = get_vectorstore({"active_docs": request.args.get("active_docs")})
    else:
        vectorstore = ""
    docsearch = get_docsearch(vectorstore, embeddings_key)

    return Response(complete_stream(question, docsearch,
                                    chat_history=history, api_key=api_key),
                    mimetype='text/event-stream')
@app.route("/api/answer", methods=["POST"])
@@ -118,85 +219,82 @@ def api_answer():
if not api_key_set:
api_key = data["api_key"]
else:
api_key = os.getenv("API_KEY")
api_key = settings.API_KEY
if not embeddings_key_set:
embeddings_key = data["embeddings_key"]
else:
embeddings_key = os.getenv("EMBEDDINGS_KEY")
embeddings_key = settings.EMBEDDINGS_KEY
# use try and except to check for exception
try:
# check if the vectorstore is set
if "active_docs" in data:
if data["active_docs"].split("/")[0] == "local":
vectorstore = "indexes/" + data["active_docs"]
else:
vectorstore = "vectors/" + data["active_docs"]
if data['active_docs'] == "default":
vectorstore = ""
else:
vectorstore = ""
print(vectorstore)
# vectorstore = "outputs/inputs/"
vectorstore = get_vectorstore(data)
# loading the index and the store and the prompt template
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
if embeddings_choice == "openai_text-embedding-ada-002":
docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2":
docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
elif embeddings_choice == "huggingface_hkunlp/instructor-large":
docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
elif embeddings_choice == "cohere_medium":
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
# create a prompt template
if history:
history = json.loads(history)
template_temp = template_hist.replace("{historyquestion}", history[0]).replace("{historyanswer}",
history[1])
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template_temp,
template_format="jinja2")
else:
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
template_format="jinja2")
docsearch = get_docsearch(vectorstore, embeddings_key)
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if llm_choice == "openai_chat":
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
llm = ChatOpenAI(openai_api_key=api_key)
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
if settings.LLM_NAME == "openai_chat":
llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4"
messages_combine = [SystemMessagePromptTemplate.from_template(chat_combine_template)]
if history:
tokens_current_history = 0
#count tokens in history
history.reverse()
for i in history:
if "prompt" in i and "response" in i:
tokens_batch = llm.get_num_tokens(i["prompt"]) + llm.get_num_tokens(i["response"])
if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
tokens_current_history += tokens_batch
messages_combine.append(HumanMessagePromptTemplate.from_template(i["prompt"]))
messages_combine.append(AIMessagePromptTemplate.from_template(i["response"]))
messages_combine.append(HumanMessagePromptTemplate.from_template("{question}"))
import sys
print(messages_combine, file=sys.stderr)
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
messages_reduce = [
SystemMessagePromptTemplate.from_template(chat_reduce_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
elif llm_choice == "openai":
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
elif llm_choice == "manifest":
elif settings.LLM_NAME == "manifest":
llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
elif llm_choice == "huggingface":
elif settings.LLM_NAME == "huggingface":
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
elif llm_choice == "cohere":
elif settings.LLM_NAME == "cohere":
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
elif settings.LLM_NAME == "gpt4all":
llm = GPT4All(model=settings.MODEL_PATH)
else:
raise ValueError("unknown LLM model")
if settings.LLM_NAME == "openai_chat":
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine)
chain = ConversationalRetrievalChain(
retriever=docsearch.as_retriever(k=2),
question_generator=question_generator,
combine_docs_chain=doc_chain,
)
chat_history = []
# result = chain({"question": question, "chat_history": chat_history})
# generate async with async generate method
result = run_async_chain(chain, question, chat_history)
elif settings.LLM_NAME == "gpt4all":
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine)
chain = ConversationalRetrievalChain(
retriever=docsearch.as_retriever(k=2),
question_generator=question_generator,
combine_docs_chain=doc_chain,
)
chat_history = []
# result = chain({"question": question, "chat_history": chat_history})
# generate async with async generate method
result = run_async_chain(chain, question, chat_history)
if llm_choice == "openai_chat":
chain = ChatVectorDBChain.from_llm(
llm=llm,
vectorstore=docsearch,
prompt=p_chat_combine,
qa_prompt=p_chat_reduce,
top_k_docs_for_context=3,
return_source_documents=False)
result = chain({"question": question, "chat_history": []})
else:
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
combine_prompt=c_prompt, question_prompt=q_prompt)
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=4)
combine_prompt=chat_combine_template, question_prompt=q_prompt)
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=3)
result = chain({"query": question})
print(result)
@@ -207,7 +305,7 @@ def api_answer():
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
except Exception:
pass
# mock result
@@ -276,7 +374,7 @@ def api_feedback():
"feedback": feedback
})
)
return {"status": 'ok'}
return {"status": http.client.responses.get(response.status_code, 'ok')}
@app.route('/api/combine', methods=['GET'])
@@ -285,7 +383,17 @@ def combined_json():
"""Provide json file with combined available indexes."""
# get json from https://d3dg1063dc54p9.cloudfront.net/combined.json
data = []
data = [{
"name": 'default',
"language": 'default',
"version": '',
"description": 'default',
"fullName": 'default',
"date": 'default',
"docLink": 'default',
"model": settings.EMBEDDINGS_NAME,
"location": "local"
}]
# structure: name, language, version, description, fullName, date, docLink
# append data from vectors_collection
for index in vectors_collection.find({'user': user}):
@@ -297,7 +405,7 @@ def combined_json():
"fullName": index['name'],
"date": index['date'],
"docLink": index['location'],
"model": embeddings_choice,
"model": settings.EMBEDDINGS_NAME,
"location": "local"
})
@@ -335,7 +443,7 @@ def upload_file():
os.makedirs(save_dir)
file.save(os.path.join(save_dir, filename))
task = ingest.delay('temp', [".rst", ".md", ".pdf"], job_name, filename, user)
task = ingest.delay('temp', [".rst", ".md", ".pdf", ".txt"], job_name, filename, user)
# task id
task_id = task.id
return {"status": 'ok', "task_id": task_id}
@@ -388,7 +496,7 @@ def upload_index_files():
"language": job_name,
"location": save_dir,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": embeddings_choice,
"model": settings.EMBEDDINGS_NAME,
"type": "local"
})
return {"status": 'ok'}

View File

@@ -0,0 +1,8 @@
import os
# Celery settings. The broker and result-backend locations are read from the
# environment; either value is None when its variable is unset.
broker_url = os.environ.get("CELERY_BROKER_URL")
result_backend = os.environ.get("CELERY_RESULT_BACKEND")

# Tasks and results are exchanged as JSON, and JSON is the only accepted content type.
accept_content = ['json']
task_serializer = result_serializer = 'json'

View File

View File

@@ -0,0 +1,22 @@
from pathlib import Path
from typing import Optional

from pydantic import BaseSettings


class Settings(BaseSettings):
    """Application configuration with built-in defaults.

    Each attribute below is a setting with its default value. The module-level
    ``settings`` instance is created with an ``.env`` file path, so values can
    be supplied there instead of relying on the defaults.
    """

    LLM_NAME: str = "openai_chat"
    EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002"
    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
    MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
    MODEL_PATH: str = "./models/gpt4all-model.bin"
    TOKENS_MAX_HISTORY: int = 150

    API_URL: str = "http://localhost:5001"  # backend url for celery worker
    # The keys have no default value, so they are typed as Optional.
    API_KEY: Optional[str] = None  # LLM api key
    EMBEDDINGS_KEY: Optional[str] = None  # api key for embeddings (if using openai, just copy API_KEY)


# The .env file is looked up in the parent of the directory containing this module.
path = Path(__file__).parent.parent.absolute()
settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")

View File

@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES
def response_error(code_status, message=None):
    """Build a JSON error response carrying the given HTTP status code.

    Args:
        code_status: HTTP status code to set on the response. It is also used
            to look up the standard reason phrase for the ``error`` field.
        message: Optional detail text; added under ``message`` only when truthy.

    Returns:
        The response produced by ``jsonify`` with ``status_code`` set to
        ``code_status``.
    """
    # Unknown status codes fall back to a generic phrase.
    reason = HTTP_STATUS_CODES.get(code_status, "something went wrong")
    body = {'error': reason}
    if message:
        body['message'] = message
    resp = jsonify(body)
    resp.status_code = code_status
    return resp
def bad_request(status_code=400, message=''):
    """Return an error response via ``response_error``, defaulting to HTTP 400.

    Args:
        status_code: HTTP status code for the response (400 by default).
        message: Optional detail text forwarded to ``response_error``.
    """
    error_response = response_error(code_status=status_code, message=message)
    return error_response

View File

@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document

View File

@@ -52,17 +52,17 @@ class SimpleDirectoryReader(BaseReader):
"""
def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
) -> None:
"""Initialize with parameters."""
super().__init__()
@@ -102,8 +102,8 @@ class SimpleDirectoryReader(BaseReader):
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
@@ -114,7 +114,7 @@ class SimpleDirectoryReader(BaseReader):
new_input_files.extend(sub_input_files)
if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0 : self.num_files_limit]
new_input_files = new_input_files[0: self.num_files_limit]
# print total number of files added
logging.debug(

View File

@@ -9,6 +9,7 @@ from typing import Dict, Union
from parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""
@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)
# Removing non ascii charactwers from isd_el['text']
# Removing non-ASCII characters from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())
for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of length of all the strings in the chunk < 25
# TODO: This value can be a user-defined variable
for chunk in Chunks:
# sum of length of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks

View File

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):
@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
return markdown_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -5,10 +5,10 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):
"""reStructuredText parser.
@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +55,8 @@ class RstParser(BaseParser):
for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue
@@ -72,7 +72,7 @@ class RstParser(BaseParser):
rst_tups.append((current_header, current_text))
#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
@@ -136,7 +136,7 @@ class RstParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -159,7 +159,7 @@ class RstParser(BaseParser):
return rst_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

View File

@@ -1,6 +1,8 @@
import os
import javalang
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -9,6 +11,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()
@@ -28,6 +31,7 @@ def extract_functions(file_path):
methods[method_name] = method_source_code
return methods
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -47,6 +51,7 @@ def extract_classes(file_path):
classes[class_name] = class_string
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict
return functions_dict, classes_dict

View File

@@ -1,6 +1,7 @@
import os
import esprima
import escodegen
import esprima
def find_files(directory):
@@ -11,6 +12,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -26,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name
@@ -38,6 +39,7 @@ def extract_functions(file_path):
functions[func_name] = escodegen.generate(declaration.init)
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -53,6 +55,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

View File

@@ -1,32 +1,32 @@
import os
import faiss
import pickle
import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.embeddings import HuggingFaceInstructEmbeddings
#from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import FAISS
from retry import retry
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings
def num_tokens_from_string(string: str, encoding_name: str) -> tuple:
    """Convert a string to tokens and estimate the user cost.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        A ``(num_tokens, total_price)`` tuple: the token count and the
        estimated cost, computed at 0.0004 per 1000 tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = ((num_tokens / 1000) * 0.0004)
    return num_tokens, total_price
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    """Add a single document's text and metadata to ``store``.

    The call is wrapped by ``@retry(tries=10, delay=60)``.

    Args:
        store: Object exposing ``add_texts(texts, metadatas=...)``.
        i: Document providing ``page_content`` and ``metadata``.
    """
    texts = [i.page_content]
    metadatas = [i.metadata]
    store.add_texts(texts, metadatas=metadatas)
    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])
def call_openai_api(docs, folder_name, task_status):
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
# create output folder if it doesn't exist
if not os.path.exists(f"{folder_name}"):
@@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status):
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
s1 = len(docs)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
store_add_texts_with_retry(store, i)
@@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status):
c1 += 1
store.save_local(f"{folder_name}")
def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
#docs_content = (" ".join(docs))
# docs_content = (" ".join(docs))
docs_content = ""
for doc in docs:
docs_content += doc.page_content
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api(docs, folder_name)

View File

@@ -1,10 +1,12 @@
import os
import ast
import tiktoken
import os
from pathlib import Path
import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -13,6 +15,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -25,6 +28,7 @@ def extract_functions(file_path):
functions[func_name] = func_def
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -40,6 +44,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
classes_dict[file] = classes
return functions_dict, classes_dict
def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
print(f"Processing file {i}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for j, (name, function) in enumerate(functions.items(), start=1):
@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
def parse_classes(classes_dict, formats, dir):
c1 = len(classes_dict)
for i, (source, classes) in enumerate(classes_dict.items()):
print(f"Processing file {i+1}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
print(f"Processing file {i + 1}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for name, function_names in classes.items():
print(f"Processing Class {i+1}/{c1}")
print(f"Processing Class {i + 1}/{c1}")
prompt = PromptTemplate(
input_variables=["class_name", "functions_names"],
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
parse_classes(classes_dict, formats, dir)
print("All done!")
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")

View File

@@ -2,7 +2,6 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument

View File

@@ -1,9 +1,9 @@
import re
import tiktoken
from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List
import tiktoken
from parser.schema.base import Document
def separate_header_and_body(text):
@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None
@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)
@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
return docs
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:
@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
    """Group small documents together, then split documents that are too large.

    Args:
        documents: Documents to process.
        max_tokens: Upper token bound passed to both the grouping and the
            splitting step.
        min_tokens: Lower token bound passed to the grouping step.
        token_check: When false, ``documents`` is returned untouched.

    Returns:
        The processed documents. If a step raises, a message is printed and
        the documents from before that step are carried forward (best effort).
    """
    if not token_check:
        return documents
    print("Grouping small documents")
    try:
        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
    except Exception:
        print("Grouping failed, try running without token_check")
    print("Separating large documents")
    try:
        documents = split_documents(documents=documents, max_tokens=max_tokens)
    except Exception:
        # Report the step that actually failed; this is the splitting step.
        print("Splitting failed, try running without token_check")
    return documents

View File

@@ -1,4 +1,9 @@
You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
Use the following pieces of context to help answer the users question.
Use the following pieces of context to help answer the users question. If its not relevant to the question, provide friendly responses.
You have access to chat history, and can use it to help answer the question.
When using code examples, use the following format:
```(language)
(code)
```
----------------
{summaries}

View File

@@ -1,3 +1,3 @@
Use the following portion of a long document to see if any of the text is relevant to answer the question.
{context}
Provide all relevant text to the question verbatim. Summarize if needed. If nothing relevant return "-".
Use the following pieces of context to help answer the users question. If its not relevant to the question, respond with "-"
----------------
{context}

View File

@@ -8,8 +8,8 @@ async-timeout==4.0.2
attrs==22.2.0
billiard==3.6.4.0
blobfile==2.0.1
boto3==1.26.84
botocore==1.29.84
boto3==1.26.102
botocore==1.29.102
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
@@ -27,8 +27,11 @@ entrypoints==0.4
faiss-cpu==1.7.3
filelock==3.9.0
Flask==2.2.3
Flask-Cors==3.0.10
frozenlist==1.3.3
geojson==2.5.0
greenlet==2.0.2
gpt4all==0.1.7
hub==3.0.1
huggingface-hub==0.12.1
humbug==0.2.8
@@ -38,14 +41,17 @@ Jinja2==3.1.2
jmespath==1.0.1
joblib==1.2.0
kombu==5.2.4
langchain==0.0.118
langchain==0.0.179
loguru==0.6.0
lxml==4.9.2
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.14
mypy-extensions==1.0.0
networkx==3.0
nltk==3.8.1
numcodecs==0.11.0
numpy==1.24.2
@@ -64,29 +70,37 @@ pycryptodomex==3.17
pydantic==1.10.5
PyJWT==2.6.0
pymongo==4.3.3
pyowm==3.3.0
PyPDF2==3.0.1
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0
pytz==2022.7.1
PyYAML==6.0
redis==4.5.2
redis==4.5.4
regex==2022.10.31
requests==2.28.2
retry==0.9.2
rsa==4.9
s3transfer==0.6.0
scikit-learn==1.2.2
scipy==1.10.1
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
SQLAlchemy==1.4.46
sympy==1.11.1
tenacity==8.2.2
tiktoken==0.3.0
tokenizers==0.13.2
threadpoolctl==3.1.0
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
transformers==4.26.1
transformers==4.27.2
typer==0.7.0
typing-inspect==0.8.0
typing_extensions==4.5.0
urllib3==1.26.14
vine==5.0.0
wcwidth==0.2.6
Werkzeug==2.2.3
yarl==1.8.2

View File

@@ -1,28 +1,31 @@
import requests
import nltk
import os
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api
from parser.token_func import group_split
from celery import current_task
import shutil
import string
import zipfile
import shutil
from urllib.parse import urljoin
import nltk
import requests
from core.settings import settings
from parser.file.bulk import SimpleDirectoryReader
from parser.open_ai_func import call_openai_api
from parser.schema.base import Document
from parser.token_func import group_split
try:
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
except FileExistsError:
pass
def metadata_from_filename(title):
return {'title': title}
def generate_random_string(length):
return ''.join([string.ascii_letters[i % 52] for i in range(length)])
def ingest_worker(self, directory, formats, name_job, filename, user):
# directory = 'inputs' or 'temp'
# formats = [".rst", ".md"]
@@ -39,12 +42,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
max_tokens = 1250
full_path = directory + '/' + user + '/' + name_job
# check if API_URL env variable is set
if not os.environ.get('API_URL'):
url = 'http://localhost:5001/api/download'
else:
url = os.environ.get('API_URL') + '/api/download'
file_data = {'name': name_job, 'file': filename, 'user': user}
response = requests.get(url, params=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
file = response.content
if not os.path.exists(full_path):
@@ -52,19 +51,17 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
with open(full_path + '/' + filename, 'wb') as f:
f.write(file)
#check if file is .zip and extract it
# check if file is .zip and extract it
if filename.endswith('.zip'):
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
zip_ref.extractall(full_path)
os.remove(full_path + '/' + filename)
import time
self.update_state(state='PROGRESS', meta={'current': 1})
raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
@@ -72,28 +69,26 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})
if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
if not os.environ.get('API_URL'):
url = 'http://localhost:5001/api/upload_index'
else:
url = os.environ.get('API_URL') + '/api/upload_index'
file_data = {'name': name_job, 'user': user}
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
'file_pkl': open(full_path + '/index.pkl', 'rb')}
response = requests.post(url, files=files, data=file_data)
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
#deletes remote
if not os.environ.get('API_URL'):
url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job
else:
url = os.environ.get('API_URL') + '/api/delete_old?path=' + 'inputs/' + user + '/' + name_job
response = requests.get(url)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path="))
# delete local
shutil.rmtree(full_path)
return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user}
return {
'directory': directory,
'formats': formats,
'name_job': name_job,
'filename': filename,
'user': user,
'limited': False
}

View File

@@ -1,4 +1,4 @@
from app import app
if __name__ == "__main__":
app.run()
app.run(debug=True, port=5001)

20
docker-compose-dev.yaml Normal file
View File

@@ -0,0 +1,20 @@
version: "3.9"

# Backing services only (Redis and MongoDB), published on their standard ports.
services:

  redis:
    image: redis:6-alpine
    ports:
      - 6379:6379

  mongo:
    image: mongo:6
    ports:
      - 27017:27017
    volumes:
      # Named volume mounted at /data/db in the mongo container.
      - mongodb_data_container:/data/db

volumes:
  mongodb_data_container:

View File

@@ -5,23 +5,26 @@ services:
build: ./frontend
environment:
- VITE_API_HOST=http://localhost:5001
- VITE_API_STREAMING=$VITE_API_STREAMING
ports:
- "5173:5173"
depends_on:
- backend
- backend
backend:
build: ./application
environment:
- API_KEY=<your_api_key>
- EMBEDDINGS_KEY=<your_api_key>
- API_KEY=$OPENAI_API_KEY
- EMBEDDINGS_KEY=$OPENAI_API_KEY
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/1
- MONGO_URI=mongodb://mongo:27017/docsgpt
ports:
- "5001:5001"
volumes:
- app_data_container:/app
- ./application/indexes:/app/indexes
- ./application/inputs:/app/inputs
- ./application/vectors:/app/vectors
depends_on:
- redis
- mongo
@@ -30,8 +33,8 @@ services:
build: ./application
command: celery -A app.celery worker -l INFO
environment:
- API_KEY=<your_api_key>
- EMBEDDINGS_KEY=<your_api_key>
- API_KEY=$OPENAI_API_KEY
- EMBEDDINGS_KEY=$OPENAI_API_KEY
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/1
- MONGO_URI=mongodb://mongo:27017/docsgpt
@@ -55,5 +58,4 @@ services:
volumes:
mongodb_data_container:
app_data_container:
mongodb_data_container:

View File

@@ -1,18 +1,20 @@
import requests
import dotenv
import os
import json
import pprint
import dotenv
import requests
from flask import Flask, request
dotenv.load_dotenv()
docsgpt_url = os.getenv("docsgpt_url")
chatwoot_url = os.getenv("chatwoot_url")
docsgpt_key = os.getenv("docsgpt_key")
chatwoot_token = os.getenv("chatwoot_token")
#account_id = os.getenv("account_id")
#assignee_id = os.getenv("assignee_id")
# account_id = os.getenv("account_id")
# assignee_id = os.getenv("assignee_id")
label_stop = "human-requested"
def send_to_bot(sender, message):
data = {
'sender': sender,
@@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message):
return r.json()
from flask import Flask, request
app = Flask(__name__)
@@ -74,7 +75,7 @@ def docsgpt():
# elif str(assignee) != str(assignee_id):
# return "Not the right assignee"
if(message_type == "incoming"):
if (message_type == "incoming"):
bot_response = send_to_bot(contact, message)
create_message = send_to_chatwoot(
account, conversation, bot_response)
@@ -83,5 +84,6 @@ def docsgpt():
return create_message
if __name__ == '__main__':
app.run(host='0.0.0.0', port=80)
app.run(host='0.0.0.0', port=80)

View File

@@ -10,7 +10,7 @@ dotenv.load_dotenv()
# Replace 'YOUR_BOT_TOKEN' with your bot's token
TOKEN = os.getenv("DISCORD_TOKEN")
PREFIX = '@docsgpt '
PREFIX = '@DocsGPT'
BASE_API_URL = 'http://localhost:5001'
intents = discord.Intents.default()
@@ -20,13 +20,11 @@ bot = commands.Bot(command_prefix=PREFIX, intents=intents)
def split_string(input_str):
pattern = r'<(.*?)>'
match = re.search(pattern, input_str)
pattern = r'^<@!?{0}>\s*'.format(bot.user.id)
match = re.match(pattern, input_str)
if match:
content = match.group(1)
rest = input_str[:match.start()] + input_str[match.end():]
return content, rest.strip()
content = input_str[match.end():].strip()
return str(bot.user.id), content
return None, input_str
@@ -59,8 +57,8 @@ async def on_message(message):
if prefix is None:
return
part_prefix = "@"
if part_prefix in prefix:
part_prefix = str(bot.user.id)
if part_prefix == prefix:
answer = await fetch_answer(content)
await message.channel.send(answer)

View File

@@ -0,0 +1,25 @@
# Chat Widget
A simple chat widget that can be easily integrated into any website.
## Installation
1. Host the `widget.html`, `styles.css`, and `script.js` files from the `src` folder on your own server or a Content Delivery Network (CDN). Make sure to note the URLs for these files.
2. Update the URLs in the `dist/chat-widget.js` file to match the locations of your hosted files:
```javascript
fetch("https://your-server-or-cdn.com/path/to/widget.html"),
fetch("https://your-server-or-cdn.com/path/to/styles.css"),
fetch("https://your-server-or-cdn.com/path/to/script.js"),
```
3. Host the `dist/chat-widget.js` file on your own server or a Content Delivery Network (CDN). Make sure to note the URL for this file.
## Integration
To integrate the chat widget into a website, add the following script tag to the HTML file, replacing URL_TO_CHAT_WIDGET_JS with the actual URL of your hosted chat-widget.js file:
```html
<script src="URL_TO_CHAT_WIDGET_JS"></script>
```

View File

@@ -0,0 +1,41 @@
// Bootstraps the DocsGPT chat widget: downloads the widget markup and its
// behaviour script, then injects both into the host page.
//
// Styling is NOT injected by this loader; the host page has to load the
// widget stylesheet itself.
(async function () {
  // Replace these with the URLs where you host the widget files.
  const WIDGET_HTML_URL =
    "https://s3-eu-west-2.amazonaws.com/arc53data/widget.html";
  const WIDGET_SCRIPT_URL =
    "https://s3-eu-west-2.amazonaws.com/arc53data/script.js";

  // Downloads a text asset, failing loudly on HTTP errors so that an error
  // page is never injected into the document as if it were widget code.
  async function fetchText(url) {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(
        "Failed to load " + url + ": " + res.status + " " + res.statusText
      );
    }
    return res.text();
  }

  try {
    // Fetch both assets in parallel.
    const [html, js] = await Promise.all([
      fetchText(WIDGET_HTML_URL),
      fetchText(WIDGET_SCRIPT_URL),
    ]);

    // Create a container for the chat widget and inject the HTML.
    const chatWidgetContainer = document.createElement("div");
    chatWidgetContainer.innerHTML = html;
    document.body.appendChild(chatWidgetContainer);

    // Execute the widget script. It must run after the markup is in the DOM
    // because it looks elements up by id at load time.
    const script = document.createElement("script");
    script.textContent = js;
    document.body.appendChild(script);
  } catch (error) {
    // Never let a widget failure surface as an unhandled rejection on the
    // host page; log it and leave the page untouched.
    console.error("DocsGPT widget failed to load:", error);
  }
})();

807
extensions/web-widget/dist/output.css vendored Normal file
View File

@@ -0,0 +1,807 @@
/*
! tailwindcss v3.3.1 | MIT License | https://tailwindcss.com
*/
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box;
/* 1 */
border-width: 0;
/* 2 */
border-style: solid;
/* 2 */
border-color: #e5e7eb;
/* 2 */
}
::before,
::after {
--tw-content: '';
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
5. Use the user's configured `sans` font-feature-settings by default.
6. Use the user's configured `sans` font-variation-settings by default.
*/
html {
line-height: 1.5;
/* 1 */
-webkit-text-size-adjust: 100%;
/* 2 */
-moz-tab-size: 4;
/* 3 */
-o-tab-size: 4;
tab-size: 4;
/* 3 */
font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
/* 4 */
font-feature-settings: normal;
/* 5 */
font-variation-settings: normal;
/* 6 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0;
/* 1 */
line-height: inherit;
/* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0;
/* 1 */
color: inherit;
/* 2 */
border-top-width: 1px;
/* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
-webkit-text-decoration: underline dotted;
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
/* 1 */
font-size: 1em;
/* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0;
/* 1 */
border-color: inherit;
/* 2 */
border-collapse: collapse;
/* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit;
/* 1 */
font-size: 100%;
/* 1 */
font-weight: inherit;
/* 1 */
line-height: inherit;
/* 1 */
color: inherit;
/* 1 */
margin: 0;
/* 2 */
padding: 0;
/* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type='button'],
[type='reset'],
[type='submit'] {
-webkit-appearance: button;
/* 1 */
background-color: transparent;
/* 2 */
background-image: none;
/* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type='search'] {
-webkit-appearance: textfield;
/* 1 */
outline-offset: -2px;
/* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button;
/* 1 */
font: inherit;
/* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::-moz-placeholder, textarea::-moz-placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
input::placeholder,
textarea::placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block;
/* 1 */
vertical-align: middle;
/* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/* Make elements with the HTML hidden attribute stay hidden by default */
[hidden] {
display: none;
}
*, ::before, ::after {
--tw-border-spacing-x: 0;
--tw-border-spacing-y: 0;
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-rotate: 0;
--tw-skew-x: 0;
--tw-skew-y: 0;
--tw-scale-x: 1;
--tw-scale-y: 1;
--tw-pan-x: ;
--tw-pan-y: ;
--tw-pinch-zoom: ;
--tw-scroll-snap-strictness: proximity;
--tw-ordinal: ;
--tw-slashed-zero: ;
--tw-numeric-figure: ;
--tw-numeric-spacing: ;
--tw-numeric-fraction: ;
--tw-ring-inset: ;
--tw-ring-offset-width: 0px;
--tw-ring-offset-color: #fff;
--tw-ring-color: rgb(59 130 246 / 0.5);
--tw-ring-offset-shadow: 0 0 #0000;
--tw-ring-shadow: 0 0 #0000;
--tw-shadow: 0 0 #0000;
--tw-shadow-colored: 0 0 #0000;
--tw-blur: ;
--tw-brightness: ;
--tw-contrast: ;
--tw-grayscale: ;
--tw-hue-rotate: ;
--tw-invert: ;
--tw-saturate: ;
--tw-sepia: ;
--tw-drop-shadow: ;
--tw-backdrop-blur: ;
--tw-backdrop-brightness: ;
--tw-backdrop-contrast: ;
--tw-backdrop-grayscale: ;
--tw-backdrop-hue-rotate: ;
--tw-backdrop-invert: ;
--tw-backdrop-opacity: ;
--tw-backdrop-saturate: ;
--tw-backdrop-sepia: ;
}
::backdrop {
--tw-border-spacing-x: 0;
--tw-border-spacing-y: 0;
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-rotate: 0;
--tw-skew-x: 0;
--tw-skew-y: 0;
--tw-scale-x: 1;
--tw-scale-y: 1;
--tw-pan-x: ;
--tw-pan-y: ;
--tw-pinch-zoom: ;
--tw-scroll-snap-strictness: proximity;
--tw-ordinal: ;
--tw-slashed-zero: ;
--tw-numeric-figure: ;
--tw-numeric-spacing: ;
--tw-numeric-fraction: ;
--tw-ring-inset: ;
--tw-ring-offset-width: 0px;
--tw-ring-offset-color: #fff;
--tw-ring-color: rgb(59 130 246 / 0.5);
--tw-ring-offset-shadow: 0 0 #0000;
--tw-ring-shadow: 0 0 #0000;
--tw-shadow: 0 0 #0000;
--tw-shadow-colored: 0 0 #0000;
--tw-blur: ;
--tw-brightness: ;
--tw-contrast: ;
--tw-grayscale: ;
--tw-hue-rotate: ;
--tw-invert: ;
--tw-saturate: ;
--tw-sepia: ;
--tw-drop-shadow: ;
--tw-backdrop-blur: ;
--tw-backdrop-brightness: ;
--tw-backdrop-contrast: ;
--tw-backdrop-grayscale: ;
--tw-backdrop-hue-rotate: ;
--tw-backdrop-invert: ;
--tw-backdrop-opacity: ;
--tw-backdrop-saturate: ;
--tw-backdrop-sepia: ;
}
.fixed {
position: fixed;
}
.absolute {
position: absolute;
}
.relative {
position: relative;
}
.inset-y-0 {
top: 0px;
bottom: 0px;
}
.bottom-5 {
bottom: 1.25rem;
}
.left-5 {
left: 1.25rem;
}
.right-2 {
right: 0.5rem;
}
.z-50 {
z-index: 50;
}
.m-0 {
margin: 0px;
}
.-mx-2 {
margin-left: -0.5rem;
margin-right: -0.5rem;
}
.mt-1 {
margin-top: 0.25rem;
}
.flex {
display: flex;
}
.hidden {
display: none;
}
.w-full {
width: 100%;
}
.flex-1 {
flex: 1 1 0%;
}
.transform {
transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));
}
.items-center {
align-items: center;
}
.justify-center {
justify-content: center;
}
.gap-2 {
gap: 0.5rem;
}
.divide-y > :not([hidden]) ~ :not([hidden]) {
--tw-divide-y-reverse: 0;
border-top-width: calc(1px * calc(1 - var(--tw-divide-y-reverse)));
border-bottom-width: calc(1px * var(--tw-divide-y-reverse));
}
.rounded-md {
border-radius: 0.375rem;
}
.rounded-b {
border-bottom-right-radius: 0.25rem;
border-bottom-left-radius: 0.25rem;
}
.border {
border-width: 1px;
}
.bg-transparent {
background-color: transparent;
}
.bg-gradient-to-br {
background-image: linear-gradient(to bottom right, var(--tw-gradient-stops));
}
.from-gray-100\/80 {
--tw-gradient-from: rgb(243 244 246 / 0.8) var(--tw-gradient-from-position);
--tw-gradient-from-position: ;
--tw-gradient-to: rgb(243 244 246 / 0) var(--tw-gradient-from-position);
--tw-gradient-to-position: ;
--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to);
}
.via-white {
--tw-gradient-via-position: ;
--tw-gradient-to: rgb(255 255 255 / 0) var(--tw-gradient-to-position);
--tw-gradient-to-position: ;
--tw-gradient-stops: var(--tw-gradient-from), #fff var(--tw-gradient-via-position), var(--tw-gradient-to);
}
.to-white {
--tw-gradient-to: #fff var(--tw-gradient-to-position);
--tw-gradient-to-position: ;
}
.p-3 {
padding: 0.75rem;
}
.px-2 {
padding-left: 0.5rem;
padding-right: 0.5rem;
}
.px-5 {
padding-left: 1.25rem;
padding-right: 1.25rem;
}
.py-3 {
padding-top: 0.75rem;
padding-bottom: 0.75rem;
}
.pl-5 {
padding-left: 1.25rem;
}
.pr-8 {
padding-right: 2rem;
}
.font-sans {
font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
}
.text-sm {
font-size: 0.875rem;
line-height: 1.25rem;
}
.text-xs {
font-size: 0.75rem;
line-height: 1rem;
}
.font-bold {
font-weight: 700;
}
.text-gray-400 {
--tw-text-opacity: 1;
color: rgb(156 163 175 / var(--tw-text-opacity));
}
.text-gray-600 {
--tw-text-opacity: 1;
color: rgb(75 85 99 / var(--tw-text-opacity));
}
.text-gray-700 {
--tw-text-opacity: 1;
color: rgb(55 65 81 / var(--tw-text-opacity));
}
.text-gray-800 {
--tw-text-opacity: 1;
color: rgb(31 41 55 / var(--tw-text-opacity));
}
.shadow {
--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1);
--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);
box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow);
}
.backdrop-blur-sm {
--tw-backdrop-blur: blur(4px);
-webkit-backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);
backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);
}
.transition {
transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, -webkit-backdrop-filter;
transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter;
transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter, -webkit-backdrop-filter;
transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
transition-duration: 150ms;
}
.delay-200 {
transition-delay: 200ms;
}
.duration-300 {
transition-duration: 300ms;
}
.hover\:bg-gray-100:hover {
--tw-bg-opacity: 1;
background-color: rgb(243 244 246 / var(--tw-bg-opacity));
}
.focus\:outline-none:focus {
outline: 2px solid transparent;
outline-offset: 2px;
}
@media (prefers-color-scheme: dark) {
.dark\:divide-gray-700 > :not([hidden]) ~ :not([hidden]) {
--tw-divide-opacity: 1;
border-color: rgb(55 65 81 / var(--tw-divide-opacity));
}
.dark\:border-gray-700 {
--tw-border-opacity: 1;
border-color: rgb(55 65 81 / var(--tw-border-opacity));
}
.dark\:from-gray-900\/80 {
--tw-gradient-from: rgb(17 24 39 / 0.8) var(--tw-gradient-from-position);
--tw-gradient-from-position: ;
--tw-gradient-to: rgb(17 24 39 / 0) var(--tw-gradient-from-position);
--tw-gradient-to-position: ;
--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to);
}
.dark\:via-gray-900 {
--tw-gradient-via-position: ;
--tw-gradient-to: rgb(17 24 39 / 0) var(--tw-gradient-to-position);
--tw-gradient-to-position: ;
--tw-gradient-stops: var(--tw-gradient-from), #111827 var(--tw-gradient-via-position), var(--tw-gradient-to);
}
.dark\:to-gray-900 {
--tw-gradient-to: #111827 var(--tw-gradient-to-position);
--tw-gradient-to-position: ;
}
.dark\:text-gray-200 {
--tw-text-opacity: 1;
color: rgb(229 231 235 / var(--tw-text-opacity));
}
.dark\:text-gray-300 {
--tw-text-opacity: 1;
color: rgb(209 213 219 / var(--tw-text-opacity));
}
.dark\:text-gray-500 {
--tw-text-opacity: 1;
color: rgb(107 114 128 / var(--tw-text-opacity));
}
.dark\:text-white {
--tw-text-opacity: 1;
color: rgb(255 255 255 / var(--tw-text-opacity));
}
.dark\:hover\:bg-gray-800\/70:hover {
background-color: rgb(31 41 55 / 0.7);
}
}
@media (min-width: 768px) {
.md\:pl-0 {
padding-left: 0px;
}
}

View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Chat Widget Test</title>
<link href="dist/output.css" rel="stylesheet">
</head>
<body>
<script src="dist/chat-widget.js"></script>
</body>
</html>

1002
extensions/web-widget/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
{
"name": "web-widget",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"devDependencies": {
"tailwindcss": "^3.3.1"
}
}

View File

@@ -0,0 +1,58 @@
<div id="docsgpt-widget" class="dark fixed bottom-5 left-5 pl-5 md:pl-0 z-50">
<style>
@keyframes dotBounce {
0%, 80%, 100% {
transform: translateY(0);
}
40% {
transform: translateY(-5px);
}
}
.dot-animation {
display: inline-block;
animation: dotBounce 1s infinite ease-in-out;
}
.delay-200 {
animation-delay: 200ms;
}
.delay-400 {
animation-delay: 400ms;
}
</style>
<div class="divide-y dark:divide-gray-700 rounded-md border dark:border-gray-700 bg-gradient-to-br from-gray-100/80 via-white to-white dark:from-gray-900/80 dark:via-gray-900 dark:to-gray-900 font-sans shadow backdrop-blur-sm" style="width: 18rem; transform: translateY(0%) translateZ(0px);"><div>
<div class="flex items-center gap-2 p-3">
<div id="docsgpt-init-message" class="flex-1">
<h3 class="text-sm font-bold text-gray-700 dark:text-gray-200">Looking for help with documentation?</h3>
<p class="mt-1 text-xs text-gray-400 dark:text-gray-500">DocsGPT AI assistant will help you with docs</p>
</div>
<div id="docsgpt-answer" class="hidden">
<p class="mt-1 text-xs text-gray-600 dark:text-gray-300">Some cool answer</p>
</div>
</div>
</div>
<div class="w-full">
<button id="ask-docsgpt" class="flex w-full justify-center px-5 py-3 text-sm text-gray-800 font-bold dark:text-white transition duration-300 hover:bg-gray-100 rounded-b dark:hover:bg-gray-800/70">
Ask DocsGPT
</button>
<form id="docsgpt-chat-form" class="relative w-full m-0 hidden" style="opacity: 1;" data-projection-id="1">
<input id="docsgpt-chat-input" type="text" class="w-full bg-transparent px-5 py-3 pr-8 text-sm text-gray-700 dark:text-white focus:outline-none" placeholder="What do you want to do?" value="">
<button class="absolute inset-y-0 right-2 -mx-2 px-2" type="submit" style="opacity: 0;" data-projection-id="2">
</button>
</form>
<p id="docsgpt-chat-processing" class="hidden flex w-full justify-center px-5 py-3 text-sm text-gray-800 font-bold dark:text-white transition duration-300 rounded-b animate-fadeIn animate-2s">
Processing<span class="dot-animation">.</span><span class="dot-animation delay-200">.</span><span class="dot-animation delay-400">.</span>
</p>
</div>
</div>
</div>

View File

@@ -0,0 +1,3 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

View File

@@ -0,0 +1,56 @@
const API_ENDPOINT = "http://localhost:5001/api/answer"; // Replace with your API endpoint
const widgetInitMessage = document.getElementById("docsgpt-init-message");
const widgetAnswerMessage = document.getElementById("docsgpt-answer");
const widgetAnswerMessageP = widgetAnswerMessage.querySelector("p");
const askDocsGPTButton = document.getElementById("ask-docsgpt");
const chatInput = document.getElementById("docsgpt-chat-input");
const chatForm = document.getElementById("docsgpt-chat-form");
const chatProcessing = document.getElementById("docsgpt-chat-processing");
/**
 * Sends a question to the DocsGPT answer endpoint.
 *
 * @param {string} message - The user's question.
 * @returns {Promise<string>} The `answer` field of the JSON response.
 * @throws {Error} If the server replies with a non-2xx status, or if the
 *   request or JSON parsing fails.
 */
async function sendMessage(message) {
  const requestData = {
    question: message,
    active_docs: "default",
    api_key: "token",
    embeddings_key: "token",
    model: "default",
    history: null,
  };

  const response = await fetch(API_ENDPOINT, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(requestData),
  });

  // fetch() only rejects on network failures; HTTP error statuses must be
  // checked explicitly, otherwise an error body is treated as an answer.
  if (!response.ok) {
    throw new Error(
      "DocsGPT request failed: " + response.status + " " + response.statusText
    );
  }

  const data = await response.json();
  return data.answer;
}
// Switches the widget from the "Ask DocsGPT" button to the question form and
// restores the initial greeting in place of any previous answer.
askDocsGPTButton.addEventListener("click", () => {
  askDocsGPTButton.classList.add("hidden");
  chatForm.classList.remove("hidden");
  // Focus the text input: a <form> element is not focusable, so focusing the
  // form itself would leave the user without a caret.
  chatInput.focus();
  widgetInitMessage.classList.remove("hidden");
  widgetAnswerMessage.classList.add("hidden");
});
// Handles a submitted question: shows the processing indicator, asks the
// backend, then displays the answer and brings back the "Ask DocsGPT" button.
chatForm.addEventListener("submit", async (e) => {
  e.preventDefault();

  const message = chatInput.value.trim();
  if (!message) return;

  chatInput.value = "";
  chatForm.classList.add("hidden");
  chatProcessing.classList.remove("hidden");

  let reply;
  try {
    reply = await sendMessage(message);
  } catch (error) {
    // Without this, a failed request would leave the widget stuck on
    // "Processing..." with no way to ask again.
    console.error("DocsGPT request failed:", error);
    reply = "Something went wrong. Please try again.";
  } finally {
    chatProcessing.classList.add("hidden");
  }

  // Use textContent rather than innerHTML: the reply comes from the server
  // and must not be interpreted as markup on the host page.
  widgetAnswerMessageP.textContent = reply;
  widgetAnswerMessage.classList.remove("hidden");
  widgetInitMessage.classList.add("hidden");
  askDocsGPTButton.classList.remove("hidden");
});

View File

@@ -0,0 +1,10 @@
/** @type {import('tailwindcss').Config} */
module.exports = {
content: ["./src/**/*.{html,js}"],
theme: {
extend: {},
},
plugins: [],
}

View File

@@ -1,2 +1,2 @@
# Please put appropriate value
VITE_API_HOST = http://localhost:5001
VITE_API_HOST=http://localhost:5001

View File

@@ -1 +1 @@
VITE_API_HOST = https://docsapi.arc53.com
VITE_API_HOST = https://gptcloud.arc53.com

File diff suppressed because it is too large Load Diff

View File

@@ -24,12 +24,15 @@
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-dropzone": "^14.2.3",
"react-markdown": "^8.0.7",
"react-redux": "^8.0.5",
"react-router-dom": "^6.8.1"
"react-router-dom": "^6.8.1",
"react-syntax-highlighter": "^15.5.0"
},
"devDependencies": {
"@types/react": "^18.0.27",
"@types/react-dom": "^18.0.10",
"@types/react-syntax-highlighter": "^15.5.6",
"@typescript-eslint/eslint-plugin": "^5.51.0",
"@typescript-eslint/parser": "^5.51.0",
"@vitejs/plugin-react": "^3.1.0",

View File

@@ -38,9 +38,8 @@ export default function Navigation({
const [isDocsListOpen, setIsDocsListOpen] = useState(false);
const isApiKeySet = useSelector(selectApiKeyStatus);
const [apiKeyModalState, setApiKeyModalState] = useState<ActiveState>(
isApiKeySet ? 'INACTIVE' : 'ACTIVE',
);
const [apiKeyModalState, setApiKeyModalState] =
useState<ActiveState>('INACTIVE');
const isSelectedDocsSet = useSelector(selectSelectedDocsStatus);
const [selectedDocsModalState, setSelectedDocsModalState] =
@@ -148,7 +147,7 @@ export default function Navigation({
src={Arrow2}
alt="arrow"
className={`${
isDocsListOpen ? 'rotate-0' : '-rotate-90'
isDocsListOpen ? 'rotate-0' : 'rotate-180'
} mr-3 w-3 transition-all`}
/>
</div>

View File

@@ -71,19 +71,15 @@ export default function Conversation() {
};
return (
<div className="flex justify-center p-6">
<div className="flex justify-center p-4">
{queries.length > 0 && (
<div className="mt-20 flex w-10/12 flex-col transition-all md:w-3/4">
<div className="mt-20 flex flex-col transition-all md:w-3/4">
{queries.map((query, index) => {
return (
<Fragment key={index}>
<ConversationBubble
ref={endMessageRef}
className={`${
index === queries.length - 1 && status === 'loading'
? 'mb-24'
: 'mb-7'
}`}
className={'mb-7'}
key={`${index}QUESTION`}
message={query.prompt}
type="QUESTION"

View File

@@ -4,6 +4,9 @@ import { FEEDBACK, MESSAGE_TYPE } from './conversationModels';
import Alert from './../assets/alert.svg';
import { ReactComponent as Like } from './../assets/like.svg';
import { ReactComponent as Dislike } from './../assets/dislike.svg';
import ReactMarkdown from 'react-markdown';
import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter';
import { vscDarkPlus } from 'react-syntax-highlighter/dist/cjs/styles/prism';
const ConversationBubble = forwardRef<
HTMLDivElement,
@@ -19,14 +22,26 @@ const ConversationBubble = forwardRef<
ref,
) {
const [showFeedback, setShowFeedback] = useState(false);
// Renders markdown lists: <ol> with decimal markers when `ordered` is set,
// otherwise <ul> with disc markers.
const List = ({
  ordered,
  children,
}: {
  ordered?: boolean;
  children: React.ReactNode;
}) => {
  const Tag = ordered ? 'ol' : 'ul';
  // Both class names are spelled out in full so Tailwind's content scan
  // keeps them in the generated stylesheet.
  const markerClass = ordered ? 'list-decimal' : 'list-disc';
  return <Tag className={`list-inside ${markerClass}`}>{children}</Tag>;
};
let bubble;
if (type === 'QUESTION') {
bubble = (
<div ref={ref} className={`flex flex-row-reverse self-end ${className}`}>
<Avatar className="mt-4 text-2xl" avatar="🧑‍💻"></Avatar>
<div className="mr-2 ml-10 flex items-center rounded-3xl bg-blue-1000 py-5 px-5 text-white">
<p className="whitespace-pre-wrap break-words">{message}</p>
<div className="mr-2 ml-10 flex items-center rounded-3xl bg-blue-1000 p-3.5 text-white">
<ReactMarkdown className="whitespace-pre-wrap break-words">
{message}
</ReactMarkdown>
</div>
</div>
);
@@ -40,7 +55,7 @@ const ConversationBubble = forwardRef<
>
<Avatar className="mt-4 text-2xl" avatar="🦖"></Avatar>
<div
className={`ml-2 mr-5 flex items-center rounded-3xl bg-gray-1000 py-5 px-5 ${
className={`ml-2 mr-5 flex items-center rounded-3xl bg-gray-1000 p-3.5 ${
type === 'ERROR'
? ' rounded-lg border border-red-2000 bg-red-1000 p-2 text-red-3000'
: ''
@@ -49,7 +64,37 @@ const ConversationBubble = forwardRef<
{type === 'ERROR' && (
<img src={Alert} alt="alert" className="mr-2 inline" />
)}
<p className="whitespace-pre-wrap break-words">{message}</p>
<ReactMarkdown
className="whitespace-pre-wrap break-words"
components={{
code({ node, inline, className, children, ...props }) {
const match = /language-(\w+)/.exec(className || '');
return !inline && match ? (
<SyntaxHighlighter
PreTag="div"
language={match[1]}
{...props}
style={vscDarkPlus}
>
{String(children).replace(/\n$/, '')}
</SyntaxHighlighter>
) : (
<code className={className ? className : ''} {...props}>
{children}
</code>
);
},
ul({ node, children }) {
return <List>{children}</List>;
},
ol({ node, children }) {
return <List ordered>{children}</List>;
},
}}
>
{message}
</ReactMarkdown>
</div>
<div
className={`mr-2 flex items-center justify-center ${

View File

@@ -7,6 +7,7 @@ export function fetchAnswerApi(
question: string,
apiKey: string,
selectedDocs: Doc,
history: Array<any> = [],
): Promise<Answer> {
let namePath = selectedDocs.name;
if (selectedDocs.language === namePath) {
@@ -37,7 +38,7 @@ export function fetchAnswerApi(
question: question,
api_key: apiKey,
embeddings_key: apiKey,
history: localStorage.getItem('chatHistory'),
history: history,
active_docs: docPath,
}),
})
@@ -45,7 +46,7 @@ export function fetchAnswerApi(
if (response.ok) {
return response.json();
} else {
Promise.reject(response);
return Promise.reject(new Error(response.statusText));
}
})
.then((data) => {
@@ -54,6 +55,52 @@ export function fetchAnswerApi(
});
}
/**
 * Streams an answer from the `/stream` endpoint via Server-Sent Events.
 *
 * Every received message is forwarded to `onEvent`. The returned promise
 * settles when the stream finishes, so callers that `await` it do not hang:
 * it resolves once a message whose JSON payload has `type === 'end'` arrives,
 * and rejects if the connection errors before that.
 *
 * @param question      The user's question.
 * @param apiKey        Key sent as both `api_key` and `embeddings_key`.
 * @param selectedDocs  Documentation set used to build the `active_docs` path.
 * @param history       Previous queries, sent JSON-encoded.
 * @param onEvent       Callback invoked for every message of the stream.
 * @returns A promise resolving to an empty `Answer` placeholder for
 *          `question`; the actual text is delivered through `onEvent`.
 */
export function fetchAnswerSteaming(
  question: string,
  apiKey: string,
  selectedDocs: Doc,
  history: Array<any> = [],
  onEvent: (event: MessageEvent) => void,
): Promise<Answer> {
  let namePath = selectedDocs.name;
  if (selectedDocs.language === namePath) {
    namePath = '.project';
  }

  let docPath = 'default';
  if (selectedDocs.location === 'local') {
    docPath = 'local' + '/' + selectedDocs.name + '/';
  } else if (selectedDocs.location === 'remote') {
    docPath =
      selectedDocs.language +
      '/' +
      namePath +
      '/' +
      selectedDocs.version +
      '/' +
      selectedDocs.model +
      '/';
  }

  return new Promise<Answer>((resolve, reject) => {
    // EventSource can only issue GET requests, so every parameter has to
    // travel in the query string.
    const url = new URL(apiHost + '/stream');
    url.searchParams.append('question', question);
    url.searchParams.append('api_key', apiKey);
    url.searchParams.append('embeddings_key', apiKey);
    url.searchParams.append('active_docs', docPath);
    url.searchParams.append('history', JSON.stringify(history));

    const eventSource = new EventSource(url.href);

    eventSource.onmessage = (event: MessageEvent) => {
      onEvent(event);

      let isEnd = false;
      try {
        isEnd = JSON.parse(event.data).type === 'end';
      } catch {
        // A payload that is not JSON cannot be the end marker.
      }

      if (isEnd) {
        // Close explicitly: an EventSource left open reconnects on its own
        // once the server ends the response, which would re-send the request.
        eventSource.close();
        resolve({ answer: '', query: question, result: '' });
      }
    };

    eventSource.onerror = () => {
      console.log('Connection failed.');
      eventSource.close();
      // No effect if the promise already resolved on the end marker.
      reject(new Error('Connection failed.'));
    };
  });
}
export function sendFeedback(
prompt: string,
response: string,

View File

@@ -1,27 +1,65 @@
import { createAsyncThunk, createSlice, PayloadAction } from '@reduxjs/toolkit';
import store from '../store';
import { fetchAnswerApi } from './conversationApi';
import { Answer, ConversationState, Query } from './conversationModels';
import { fetchAnswerApi, fetchAnswerSteaming } from './conversationApi';
import { Answer, ConversationState, Query, Status } from './conversationModels';
const initialState: ConversationState = {
queries: [],
status: 'idle',
};
export const fetchAnswer = createAsyncThunk<
Answer,
{ question: string },
{ state: RootState }
>('fetchAnswer', async ({ question }, { getState }) => {
const state = getState();
const API_STREAMING = import.meta.env.VITE_API_STREAMING === 'true';
const answer = await fetchAnswerApi(
question,
state.preference.apiKey,
state.preference.selectedDocs!,
);
return answer;
});
// Async thunk that asks the backend for an answer to `question` and writes the
// response into the most recent entry of `state.conversation.queries`.
// When API_STREAMING is true the answer arrives in chunks through
// fetchAnswerSteaming; otherwise a single fetchAnswerApi call is made.
// The thunk's own return value is always an empty Answer placeholder; the
// real answer text is delivered by the dispatched actions below.
export const fetchAnswer = createAsyncThunk<Answer, { question: string }>(
'fetchAnswer',
async ({ question }, { dispatch, getState }) => {
// Snapshot of the store taken once, when the thunk starts; the indices
// computed from it below all refer to this snapshot.
const state = getState() as RootState;
if (state.preference) {
if (API_STREAMING) {
await fetchAnswerSteaming(
question,
state.preference.apiKey,
state.preference.selectedDocs!,
state.conversation.queries,
(event) => {
// Each streamed message carries a JSON payload.
const data = JSON.parse(event.data);
// check if the 'end' event has been received
if (data.type === 'end') {
// set status to 'idle'
dispatch(conversationSlice.actions.setStatus('idle'));
} else {
// Any other message is treated as a chunk of the answer, which is
// dispatched for the last query of the snapshot.
const result = data.answer;
dispatch(
updateStreamingQuery({
index: state.conversation.queries.length - 1,
query: { response: result },
}),
);
}
},
);
} else {
const answer = await fetchAnswerApi(
question,
state.preference.apiKey,
state.preference.selectedDocs!,
state.conversation.queries,
);
if (answer) {
// Non-streaming: store the whole answer at once, then mark the
// conversation as idle.
dispatch(
updateQuery({
index: state.conversation.queries.length - 1,
query: { response: answer.answer },
}),
);
dispatch(conversationSlice.actions.setStatus('idle'));
}
}
}
// Placeholder result; see the note at the top of the thunk.
return { answer: '', query: question, result: '' };
},
);
export const conversationSlice = createSlice({
name: 'conversation',
@@ -30,6 +68,21 @@ export const conversationSlice = createSlice({
addQuery(state, action: PayloadAction<Query>) {
state.queries.push(action.payload);
},
/**
 * Applies a streamed update to the query at `action.payload.index`.
 *
 * When the payload carries a string `response`, it is appended to the
 * response accumulated so far (a missing response counts as ''). Any other
 * payload is shallow-merged into the existing query.
 */
updateStreamingQuery(
  state,
  action: PayloadAction<{ index: number; query: Partial<Query> }>,
) {
  const { index, query } = action.payload;
  // Test the type, not truthiness: an empty-string chunk must be appended
  // (a no-op) rather than fall through to the merge branch, where
  // `{ response: '' }` would overwrite everything streamed so far.
  if (typeof query.response === 'string') {
    state.queries[index].response =
      (state.queries[index].response || '') + query.response;
  } else {
    state.queries[index] = {
      ...state.queries[index],
      ...query,
    };
  }
},
updateQuery(
state,
action: PayloadAction<{ index: number; query: Partial<Query> }>,
@@ -40,17 +93,15 @@ export const conversationSlice = createSlice({
...action.payload.query,
};
},
// Replaces the conversation status with the value carried in the action payload.
setStatus(state, action: PayloadAction<Status>) {
state.status = action.payload;
},
},
extraReducers(builder) {
builder
.addCase(fetchAnswer.pending, (state) => {
state.status = 'loading';
})
.addCase(fetchAnswer.fulfilled, (state, action) => {
state.status = 'idle';
state.queries[state.queries.length - 1].response =
action.payload.answer;
})
.addCase(fetchAnswer.rejected, (state, action) => {
state.status = 'failed';
state.queries[state.queries.length - 1].error =
@@ -65,5 +116,6 @@ export const selectQueries = (state: RootState) => state.conversation.queries;
export const selectStatus = (state: RootState) => state.conversation.status;
export const { addQuery, updateQuery } = conversationSlice.actions;
export const { addQuery, updateQuery, updateStreamingQuery } =
conversationSlice.actions;
export default conversationSlice.reducer;

View File

@@ -13,8 +13,18 @@ interface Preference {
}
const initialState: Preference = {
apiKey: '',
selectedDocs: null,
apiKey: 'xxx',
selectedDocs: {
name: 'default',
language: 'default',
location: 'default',
version: 'default',
description: 'default',
fullName: 'default',
dat: 'default',
docLink: 'default',
model: 'openai_text-embedding-ada-002',
} as Doc,
sourceDocs: null,
};
@@ -29,7 +39,7 @@ export const prefSlice = createSlice({
state.selectedDocs = action.payload;
},
setSourceDocs: (state, action) => {
state.sourceDocs?.push(...action.payload);
state.sourceDocs = action.payload;
},
},
});

View File

@@ -19,20 +19,27 @@ export default function Upload({
type: 'UPLOAD' | 'TRAINIING';
percentage: number;
taskId?: string;
failed?: boolean;
}>();
function Progress({
title,
isCancellable = false,
isFailed = false,
}: {
title: string;
isCancellable?: boolean;
isFailed?: boolean;
}) {
return (
<div className="mt-5 flex flex-col items-center gap-2">
<p className="text-xl tracking-[0.15px]">{title}...</p>
<p className="text-sm text-gray-2000">This may take several minutes</p>
<p className={`ml-5 text-xl text-red-400 ${isFailed ? '' : 'hidden'}`}>
Over the token limit, please consider uploading smaller document
</p>
<p className="mt-10 text-2xl">{progress?.percentage || 0}%</p>
<div className="mb-10 w-[50%]">
<div className="h-1 w-[100%] bg-blue-4000"></div>
<div
@@ -40,6 +47,7 @@ export default function Upload({
style={{ width: `${progress?.percentage || 0}%` }}
></div>
</div>
<button
onClick={() => {
setDocName('');
@@ -71,11 +79,28 @@ export default function Upload({
.then((data) => data.json())
.then((data) => {
if (data.status == 'SUCCESS') {
getDocs().then((data) => dispatch(setSourceDocs(data)));
setProgress(
(progress) => progress && { ...progress, percentage: 100 },
);
} else {
if (data.result.limited === true) {
getDocs().then((data) => dispatch(setSourceDocs(data)));
setProgress(
(progress) =>
progress && {
...progress,
percentage: 100,
failed: true,
},
);
} else {
getDocs().then((data) => dispatch(setSourceDocs(data)));
setProgress(
(progress) =>
progress && {
...progress,
percentage: 100,
failed: false,
},
);
}
} else if (data.status == 'PROGRESS') {
setProgress(
(progress) =>
progress && {
@@ -91,6 +116,7 @@ export default function Upload({
<Progress
title="Training is in progress"
isCancellable={progress?.percentage === 100}
isFailed={progress?.failed === true}
></Progress>
);
}
@@ -125,10 +151,18 @@ export default function Upload({
const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop,
multiple: true,
multiple: false,
onDragEnter: doNothing,
onDragOver: doNothing,
onDragLeave: doNothing,
maxSize: 25000000,
accept: {
'application/pdf': ['.pdf'],
'text/plain': ['.txt'],
'text/x-rst': ['.rst'],
'text/x-markdown': ['.md'],
'application/zip': ['.zip'],
},
});
let view;
@@ -139,7 +173,10 @@ export default function Upload({
} else {
view = (
<>
<p className="mb-7 text-xl text-jet">Upload New Documentation</p>
<p className="text-xl text-jet">Upload New Documentation</p>
<p className="mb-3 text-xs text-gray-4000">
Please upload .pdf, .txt, .rst, .md, .zip limited to 25mb
</p>
<input
type="text"
className="h-10 w-[60%] rounded-md border-2 border-gray-5000 px-3 outline-none"

View File

@@ -1,20 +1,13 @@
import ast
import json
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
import ast
dotenv.load_dotenv()
ps = list(Path("inputs").glob("**/*.py"))
data = []
sources = []
@@ -24,13 +17,6 @@ for p in ps:
sources.append(p)
# with open('inputs/client.py', 'r') as f:
# tree = ast.parse(f.read())
# print(tree)
def get_functions_in_class(node):
functions = []
functions_code = []
@@ -64,21 +50,9 @@ for code in data:
c1 += 1
# save the structure dict as json
import json
with open('structure_dict.json', 'w') as f:
json.dump(structure_dict, f)
# llm = OpenAI(temperature=0)
# prompt = PromptTemplate(
# input_variables=["code"],
# template="Code: {code}, Documentation: ",
# )
#
# print(prompt.format(code="print('hello world')"))
# print(llm(prompt.format(code="print('hello world')")))
if not Path("outputs").exists():
Path("outputs").mkdir()
@@ -119,8 +93,3 @@ for source, classes in structure_dict.items():
else:
with open(f"outputs/{source_w}", "a") as f:
f.write(f"\n\nFunction: {functions[function]}, \nDocumentation: {response}")

View File

@@ -1,21 +1,20 @@
import os
import sys
import nltk
import dotenv
import typer
from collections import defaultdict
from typing import List, Optional
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.token_func import group_split
import dotenv
import nltk
import typer
from parser.file.bulk import SimpleDirectoryReader
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.py2doc import transform_to_docs
from parser.schema.base import Document
from parser.token_func import group_split
dotenv.load_dotenv()
@@ -25,28 +24,32 @@ nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
#Splits all files in specified folder to documents
# Builds the metadata dict attached to a loaded document: the given value is
# stored unchanged under the 'title' key. It is passed to SimpleDirectoryReader
# as the `file_metadata` callback, which invokes it with str(input_file).
def metadata_from_filename(title):
return {'title': title}
# Splits all files in specified folder to documents
@app.command()
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
help="Whether to skip price confirmation"),
help="Whether to skip price confirmation"),
dir: Optional[List[str]] = typer.Option(["inputs"],
help="""List of paths to directory for index creation.
E.g. --dir inputs --dir inputs2"""),
file: Optional[List[str]] = typer.Option(None,
help="""File paths to use (Optional; overrides dir).
help="""File paths to use (Optional; overrides dir).
E.g. --file inputs/1.md --file inputs/2.md"""),
recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."),
limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
help="""List of required extensions (list with .)
Currently supported:
.rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
sample: Optional[bool] = typer.Option(False, help="Whether to output sample of the first 5 split documents."),
sample: Optional[bool] = typer.Option(False,
help="Whether to output sample of the first 5 split documents."),
token_check: Optional[bool] = typer.Option(True, help="Whether to group small documents and split large."),
min_tokens: Optional[int] = typer.Option(150, help="Minimum number of tokens to not group."),
max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split."),
):
"""
Creates index from specified location or files.
By default /inputs folder is used, .rst and .md are parsed.
@@ -55,23 +58,23 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
def process_one_docs(directory, folder_name):
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
#Old method
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens,
token_check=token_check)
# Old method
# text_splitter = RecursiveCharacterTextSplitter()
# docs = text_splitter.split_documents(raw_docs)
#Sample feature
if sample == True:
# Sample feature
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we check for command line arguments for bot calls.
# If no argument exists or the yes is not True, then the
# user permission is requested to call the API.
@@ -98,12 +101,11 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@app.command()
def convert(dir: Optional[str] = typer.Option("inputs",
help="""Path to directory to make documentation for.
help="""Path to directory to make documentation for.
E.g. --dir inputs """),
formats: Optional[str] = typer.Option("py",
help="""Required language.
help="""Required language.
py, js, java supported for now""")):
"""
Creates documentation linked to original functions from specified location.
By default /inputs folder is used, .py is parsed.
@@ -117,7 +119,7 @@ def convert(dir: Optional[str] = typer.Option("inputs",
else:
raise Exception("Sorry, language not supported yet")
transform_to_docs(functions_dict, classes_dict, formats, dir)
if __name__ == "__main__":
app()
app()

View File

@@ -1,38 +1,42 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
from pathlib import Path
import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()
@@ -41,7 +45,8 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")
#Load .env file
# Load .env file
dotenv.load_dotenv()
ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")

View File

@@ -1,71 +1,75 @@
import os
import pickle
import dotenv
import tiktoken
import sys
import faiss
import shutil
import sys
from argparse import ArgumentParser
from pathlib import Path
from langchain.vectorstores import FAISS
import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sphinx.cmd.build import main as sphinx_main
from argparse import ArgumentParser
def convert_rst_to_txt(src_dir, dst_dir):
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()
@@ -74,6 +78,7 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")
ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
ap.add_argument("-i", "--inputs",
type=str,
@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
help="Directory containing documentation files")
args = ap.parse_args()
#Load .env file
# Load .env file
dotenv.load_dotenv()
#Directory to vector
# Directory to vector
src_dir = args.inputs
dst_dir = "tmp"
convert_rst_to_txt(src_dir, dst_dir)
# Here we load in the data in the format that Notion exports it in.
ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))
# parse all child directories
data = []

View File

@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document

View File

@@ -1,8 +1,5 @@
"""Simple reader that reads files of different formats from a directory."""
import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
@@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
@@ -52,17 +51,17 @@ class SimpleDirectoryReader(BaseReader):
"""
def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
) -> None:
"""Initialize with parameters."""
super().__init__()
@@ -103,8 +102,8 @@ class SimpleDirectoryReader(BaseReader):
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
@@ -115,7 +114,7 @@ class SimpleDirectoryReader(BaseReader):
new_input_files.extend(sub_input_files)
if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0 : self.num_files_limit]
new_input_files = new_input_files[0: self.num_files_limit]
# print total number of files added
logging.debug(
@@ -151,10 +150,15 @@ class SimpleDirectoryReader(BaseReader):
data = f.read()
if isinstance(data, List):
data_list.extend(data)
if self.file_metadata is not None:
for _ in range(len(data)):
metadata_list.append(self.file_metadata(str(input_file)))
else:
data_list.append(str(data))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if concatenate:
return [Document("\n".join(data_list))]

View File

@@ -9,6 +9,7 @@ from typing import Dict, Union
from parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""
@@ -23,21 +24,20 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)
# Removing non ascii characters from isd_el['text']
for isd_el in isd:
@@ -46,15 +46,15 @@ class HTMLParser(BaseParser):
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())
for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of length of all the strings in the chunk < 25
# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of length of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks

View File

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):
@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
return markdown_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -5,10 +5,10 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):
"""reStructuredText parser.
@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +55,8 @@ class RstParser(BaseParser):
for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue
@@ -72,7 +72,7 @@ class RstParser(BaseParser):
rst_tups.append((current_header, current_text))
#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
@@ -136,7 +136,7 @@ class RstParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -159,7 +159,7 @@ class RstParser(BaseParser):
return rst_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

View File

@@ -1,6 +1,8 @@
import os
import javalang
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -9,6 +11,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()
@@ -28,6 +31,7 @@ def extract_functions(file_path):
methods[method_name] = method_source_code
return methods
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -47,6 +51,7 @@ def extract_classes(file_path):
classes[class_name] = class_string
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict
return functions_dict, classes_dict

View File

@@ -1,6 +1,7 @@
import os
import esprima
import escodegen
import esprima
def find_files(directory):
@@ -11,6 +12,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -26,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name
@@ -38,6 +39,7 @@ def extract_functions(file_path):
functions[func_name] = escodegen.generate(declaration.init)
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -53,6 +55,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

View File

@@ -1,32 +1,32 @@
import os
import faiss
import pickle
import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.embeddings import HuggingFaceInstructEmbeddings
#from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import FAISS
from retry import retry
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
#store_pine.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])
def call_openai_api(docs, folder_name):
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
# create output folder if it doesn't exist
if not os.path.exists(f"outputs/{folder_name}"):
@@ -37,21 +37,22 @@ def call_openai_api(docs, folder_name):
# remove the first element from docs
docs.pop(0)
# cut first n docs if you want to restart
#docs = docs[:n]
# docs = docs[:n]
c1 = 0
# pinecone.init(
# api_key="", # find at app.pinecone.io
# environment="us-east1-gcp" # next to api key in console
# )
#index_name = "pandas"
# index_name = "pandas"
store = FAISS.from_documents(docs_test, OpenAIEmbeddings())
#store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name)
# store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name)
# Uncomment for MPNet embeddings
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
store_add_texts_with_retry(store, i)
except Exception as e:
@@ -64,20 +65,20 @@ def call_openai_api(docs, folder_name):
c1 += 1
store.save_local(f"outputs/{folder_name}")
def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
#docs_content = (" ".join(docs))
# docs_content = (" ".join(docs))
docs_content = ""
for doc in docs:
docs_content += doc.page_content
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api(docs, folder_name)

View File

@@ -1,10 +1,12 @@
import os
import ast
import tiktoken
import os
from pathlib import Path
import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -13,6 +15,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -25,6 +28,7 @@ def extract_functions(file_path):
functions[func_name] = func_def
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -40,6 +44,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
classes_dict[file] = classes
return functions_dict, classes_dict
def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
print(f"Processing file {i}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for j, (name, function) in enumerate(functions.items(), start=1):
@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
def parse_classes(classes_dict, formats, dir):
c1 = len(classes_dict)
for i, (source, classes) in enumerate(classes_dict.items()):
print(f"Processing file {i+1}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
print(f"Processing file {i + 1}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for name, function_names in classes.items():
print(f"Processing Class {i+1}/{c1}")
print(f"Processing Class {i + 1}/{c1}")
prompt = PromptTemplate(
input_variables=["class_name", "functions_names"],
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
parse_classes(classes_dict, formats, dir)
print("All done!")
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")

View File

@@ -2,7 +2,6 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument

View File

@@ -1,9 +1,9 @@
import re
import tiktoken
from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List
import tiktoken
from parser.schema.base import Document
def separate_header_and_body(text):
@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None
@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)
@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
return docs
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:
@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents

View File

@@ -33,7 +33,7 @@ esutils==1.0.1
et-xmlfile==1.1.0
faiss-cpu==1.7.3
filelock==3.9.0
Flask==2.2.2
Flask==2.2.5
frozenlist==1.3.3
greenlet==2.0.2
gunicorn==20.1.0
@@ -88,7 +88,7 @@ python-magic==0.4.27
python-pptx==0.6.21
pytz==2022.7.1
PyYAML==6.0
redis==4.5.1
redis==4.5.4
regex==2022.10.31
requests==2.28.2
retry==0.9.2
@@ -124,8 +124,7 @@ typing-inspect==0.8.0
typing_extensions==4.4.0
unstructured==0.4.11
urllib3==1.26.14
Werkzeug==2.2.3
wrapt==1.14.1
XlsxWriter==3.0.8
xxhash==3.2.0
yarl==1.8.2
yarl==1.8.2

45
setup.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
#
# Build the frontend and backend images and start the whole stack as plain
# Docker containers: redis, mongo, backend, Celery worker and frontend.
#
# Usage:
#   OPENAI_API_KEY=<key> ./setup.sh
#
# Environment:
#   OPENAI_API_KEY  forwarded to the backend and worker containers as both
#                   API_KEY and EMBEDDINGS_KEY. A warning is printed when it
#                   is unset or empty; the containers are still started.
#
# The script aborts on the first failing command so that a broken build or a
# container that could not be started is not silently followed by services
# that depend on it.

set -euo pipefail

# Always operate relative to the directory this script lives in.
cd "$(dirname "$0")" || exit 1

app_dir="$(pwd)/application"
readonly app_dir

openai_key="${OPENAI_API_KEY:-}"
if [[ -z "${openai_key}" ]]; then
  printf 'warning: OPENAI_API_KEY is not set; API_KEY and EMBEDDINGS_KEY will be empty\n' >&2
fi

# Create the required directories on the host machine if they don't exist
# (mkdir -p is a no-op for directories that are already present).
mkdir -p -- "${app_dir}/indexes" "${app_dir}/inputs" "${app_dir}/vectors"

# Build frontend and backend images
docker build -t frontend_image ./frontend
docker build -t backend_image ./application

# Run redis and mongo services
docker run -d --name redis -p 6379:6379 redis:6-alpine
docker run -d --name mongo -p 27017:27017 -v mongodb_data_container:/data/db mongo:6

# Settings shared by the backend and the worker. Kept in arrays so that every
# value reaches docker as exactly one argument, whatever characters the key or
# the working directory contain.
service_links=(--link redis:redis --link mongo:mongo)
service_env=(
  -e "API_KEY=${openai_key}"
  -e "EMBEDDINGS_KEY=${openai_key}"
  -e "CELERY_BROKER_URL=redis://redis:6379/0"
  -e "CELERY_RESULT_BACKEND=redis://redis:6379/1"
  -e "MONGO_URI=mongodb://mongo:27017/docsgpt"
)

# Run backend and worker services
docker run -d --name backend -p 5001:5001 \
  "${service_links[@]}" \
  -v "${app_dir}/indexes:/app/indexes" \
  -v "${app_dir}/inputs:/app/inputs" \
  -v "${app_dir}/vectors:/app/vectors" \
  "${service_env[@]}" \
  backend_image

# The worker reaches the backend through API_URL=http://backend:5001. On the
# default bridge network a container name is only resolvable through --link,
# so the backend has to be linked here in addition to redis and mongo.
docker run -d --name worker \
  "${service_links[@]}" --link backend:backend \
  "${service_env[@]}" \
  -e "API_URL=http://backend:5001" \
  backend_image \
  celery -A app.celery worker -l INFO

# Run frontend service
docker run -d --name frontend -p 5173:5173 \
  -e "VITE_API_HOST=http://localhost:5001" \
  frontend_image