Compare commits
319 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
95fdedf12e | ||
|
|
c73dd776db | ||
|
|
891e5fea3f | ||
|
|
bb2f6f23b5 | ||
|
|
cd9b03bdb9 | ||
|
|
a619269502 | ||
|
|
9a33bf2210 | ||
|
|
34b4cd2231 | ||
|
|
6045cbbc62 | ||
|
|
9bbf4044e0 | ||
|
|
fcf8a64d91 | ||
|
|
2c6ab18e41 | ||
|
|
2fea294b13 | ||
|
|
b47ecab1a9 | ||
|
|
b86c294250 | ||
|
|
3eacfb91aa | ||
|
|
94164c2a71 | ||
|
|
d85eb83ea2 | ||
|
|
b2002639db | ||
|
|
347cfe253f | ||
|
|
833e1836e1 | ||
|
|
e4be38b9f7 | ||
|
|
783e7f6939 | ||
|
|
c1c54f4848 | ||
|
|
86be6be2d2 | ||
|
|
35a63e867a | ||
|
|
9c12a417ee | ||
|
|
32a019c0d6 | ||
|
|
b7e4a3c99e | ||
|
|
039062d071 | ||
|
|
83ae3e8371 | ||
|
|
852de8bdfc | ||
|
|
b8acb860aa | ||
|
|
e6849b85d1 | ||
|
|
8fa9657ba6 | ||
|
|
04b038960b | ||
|
|
52507a5a95 | ||
|
|
d8505ba2ab | ||
|
|
fa26c0997e | ||
|
|
5a0aadd2ae | ||
|
|
025549ebf8 | ||
|
|
e85a583f0a | ||
|
|
f7244ddb7a | ||
|
|
d983a519e3 | ||
|
|
ae01070b8f | ||
|
|
b2118602d9 | ||
|
|
9303f3b47b | ||
|
|
e5c43cfc4b | ||
|
|
45fc08e221 | ||
|
|
67e8511106 | ||
|
|
4f7fd0a62b | ||
|
|
88fe454962 | ||
|
|
26f7a9be0a | ||
|
|
9256926bb7 | ||
|
|
2a83318739 | ||
|
|
d6e2535a5e | ||
|
|
2bffb7e22c | ||
|
|
24a162cf86 | ||
|
|
f3104f3bc4 | ||
|
|
45f1bf6709 | ||
|
|
40b2590815 | ||
|
|
dd9ab46b5c | ||
|
|
c2aeadae33 | ||
|
|
1bd9759ab7 | ||
|
|
dcdbb05168 | ||
|
|
ae117c47e9 | ||
|
|
7f7856f0e4 | ||
|
|
aa7b7c8619 | ||
|
|
ee0cbff245 | ||
|
|
c2c18b25d2 | ||
|
|
816c7c95ed | ||
|
|
cb5d65d11a | ||
|
|
75f3f43ba0 | ||
|
|
9a521355ed | ||
|
|
47bfdf0710 | ||
|
|
e1b49c3fb4 | ||
|
|
374dffc5fa | ||
|
|
4f735a5d11 | ||
|
|
94738d8fc4 | ||
|
|
adb4bfa10b | ||
|
|
48e6bbdc97 | ||
|
|
b54d6fea44 | ||
|
|
4462e6339d | ||
|
|
c1581b69f4 | ||
|
|
14284e0cc7 | ||
|
|
de40e733ec | ||
|
|
9d91b6f780 | ||
|
|
6a8b49f9c4 | ||
|
|
445a8a5647 | ||
|
|
83ce4a538a | ||
|
|
35a19d2007 | ||
|
|
505e12c5ea | ||
|
|
b2bfd7f23a | ||
|
|
cdb96e715d | ||
|
|
b3e5f09e3b | ||
|
|
db542d668a | ||
|
|
a8a79a55a4 | ||
|
|
47f62a87a7 | ||
|
|
44f353861a | ||
|
|
a2ef84a4a0 | ||
|
|
12ac20ec43 | ||
|
|
ecfbc7b9fd | ||
|
|
ba2fe0fb1f | ||
|
|
890a20edba | ||
|
|
e6f48c9403 | ||
|
|
909f0afa69 | ||
|
|
5ed2b99b8c | ||
|
|
7848751fd8 | ||
|
|
e593241d75 | ||
|
|
fcdc7b7aeb | ||
|
|
c3c7878f28 | ||
|
|
85f9ae5a0a | ||
|
|
98a97f34f5 | ||
|
|
98d647a3fe | ||
|
|
9a393b4f74 | ||
|
|
88d74235e1 | ||
|
|
36fa470348 | ||
|
|
33dce10bc3 | ||
|
|
feed0b288f | ||
|
|
1b7dc8a509 | ||
|
|
87cc3cf168 | ||
|
|
eac7b1e9f2 | ||
|
|
bb1a42df91 | ||
|
|
ac5ac3e9f1 | ||
|
|
bed25b317c | ||
|
|
1687e6682a | ||
|
|
22572c8ed1 | ||
|
|
8187a339f0 | ||
|
|
382c3930a2 | ||
|
|
a64a30c088 | ||
|
|
dac76a867f | ||
|
|
b2e86e105d | ||
|
|
b8e57c9b6f | ||
|
|
486a1bc9de | ||
|
|
b1b610f4b5 | ||
|
|
68447a6009 | ||
|
|
a55280b941 | ||
|
|
830462d525 | ||
|
|
ce8b29e9d0 | ||
|
|
6ab15f8eb1 | ||
|
|
96eb68e042 | ||
|
|
bf78bdd6d4 | ||
|
|
d998815847 | ||
|
|
00ba7b78ca | ||
|
|
0b735d94f1 | ||
|
|
301989540f | ||
|
|
e26b95a26f | ||
|
|
049c1ddb48 | ||
|
|
2f1c3075a2 | ||
|
|
b1a5068fd6 | ||
|
|
01fbd5d702 | ||
|
|
5916f92f1a | ||
|
|
5e45268f68 | ||
|
|
b8e28e0c12 | ||
|
|
04f824ea36 | ||
|
|
c216bea031 | ||
|
|
e72ef478dc | ||
|
|
897b4ef2cd | ||
|
|
2404899e28 | ||
|
|
a2dfc2cbdc | ||
|
|
92373b25a9 | ||
|
|
ce1840a9ae | ||
|
|
c4f4bdd789 | ||
|
|
ec5068e85b | ||
|
|
1d9d0ddf27 | ||
|
|
e393be90dd | ||
|
|
e633df06e4 | ||
|
|
0ff5f408d6 | ||
|
|
5eda42ff31 | ||
|
|
84168e22d0 | ||
|
|
b722845aff | ||
|
|
fd54682c02 | ||
|
|
f5e287ffa6 | ||
|
|
fb10a546d6 | ||
|
|
006897f1c0 | ||
|
|
968849e52b | ||
|
|
8bee47dc50 | ||
|
|
08250120d1 | ||
|
|
8892b70785 | ||
|
|
534e4cb591 | ||
|
|
489abdcb0b | ||
|
|
f6b6c2e9a3 | ||
|
|
43c016f024 | ||
|
|
c0e7d9cd8b | ||
|
|
5f687a31f8 | ||
|
|
f2d2478dee | ||
|
|
8a98789be1 | ||
|
|
87a5c8894a | ||
|
|
7e92ed4501 | ||
|
|
a57cdfff1e | ||
|
|
d4ff6d4d7a | ||
|
|
63d99d6a57 | ||
|
|
fce7d34171 | ||
|
|
e7df7f69b3 | ||
|
|
94cc18bd71 | ||
|
|
39024ce2ac | ||
|
|
7ac4f45e7b | ||
|
|
f209eebaf8 | ||
|
|
4889db78c9 | ||
|
|
bff200fede | ||
|
|
af6f783043 | ||
|
|
610adcbefc | ||
|
|
1d3631fa04 | ||
|
|
0630504664 | ||
|
|
577d58c92b | ||
|
|
899777632b | ||
|
|
6d5b698c39 | ||
|
|
dd9f1abcea | ||
|
|
b4bd34fb96 | ||
|
|
014971262d | ||
|
|
36ed69b07e | ||
|
|
bbf55ca46e | ||
|
|
3f88b04c4a | ||
|
|
f8910ba136 | ||
|
|
6c95d8b13e | ||
|
|
e6bccaaf4e | ||
|
|
3b8039a580 | ||
|
|
fae3f55010 | ||
|
|
20c877f75b | ||
|
|
8380858a82 | ||
|
|
d2358c399d | ||
|
|
c3af8a77af | ||
|
|
bc5a0b030b | ||
|
|
0b94f1717f | ||
|
|
aaa1249a41 | ||
|
|
ffaa22c49b | ||
|
|
0b78480977 | ||
|
|
ec4fc17e3a | ||
|
|
78b85fb664 | ||
|
|
6b6737613a | ||
|
|
da5d62cc1c | ||
|
|
6a68b63192 | ||
|
|
ff2e79fe7b | ||
|
|
1800e51b19 | ||
|
|
ba9c505249 | ||
|
|
bc9f1c17ed | ||
|
|
74845aed64 | ||
|
|
e49dd0cc6a | ||
|
|
27c45ae24a | ||
|
|
364a14adaf | ||
|
|
5c560b1dd5 | ||
|
|
28b8b88332 | ||
|
|
e39ef0cc9e | ||
|
|
8098d3fec8 | ||
|
|
059ffe09ea | ||
|
|
36a845c29e | ||
|
|
ce6f0dab56 | ||
|
|
f200ab10a4 | ||
|
|
3001688e0e | ||
|
|
a73774099e | ||
|
|
b28676d52c | ||
|
|
eef012b4d1 | ||
|
|
1417a1c020 | ||
|
|
962becb9a5 | ||
|
|
168648e789 | ||
|
|
7f56f57778 | ||
|
|
6cadddc2fc | ||
|
|
15fd54eac4 | ||
|
|
31350e6302 | ||
|
|
8742cdae0a | ||
|
|
4efcb388ff | ||
|
|
2d92e95c8a | ||
|
|
47e5d5684a | ||
|
|
b723e14d98 | ||
|
|
c9d24b8f42 | ||
|
|
43622e7ab1 | ||
|
|
5cfc185ba5 | ||
|
|
4be2635fbe | ||
|
|
0beafb8391 | ||
|
|
1d2654b9fa | ||
|
|
a4bc3673e7 | ||
|
|
fa080537e8 | ||
|
|
bdf67a7db7 | ||
|
|
db4cdc901c | ||
|
|
16a540b89b | ||
|
|
e00ec9ac3f | ||
|
|
fc760afdfc | ||
|
|
cb47bcdb0e | ||
|
|
8d62559ca8 | ||
|
|
dbe9c4dc18 | ||
|
|
1609b4562d | ||
|
|
b6cadb1d65 | ||
|
|
7aafac5b5e | ||
|
|
36f0aacb19 | ||
|
|
0c1a6a918d | ||
|
|
d1f5ff4dba | ||
|
|
77e6df2a1c | ||
|
|
119c037f24 | ||
|
|
97fe1abfd8 | ||
|
|
3a0163f0fb | ||
|
|
d3fab69155 | ||
|
|
9395d2c091 | ||
|
|
b9efb98280 | ||
|
|
60bb264663 | ||
|
|
316dd2f165 | ||
|
|
8a0f700563 | ||
|
|
3d0c6eafec | ||
|
|
46e055833b | ||
|
|
80dfdd1cb9 | ||
|
|
db21678b74 | ||
|
|
09c7fe0565 | ||
|
|
b6dfb2c856 | ||
|
|
ab46ba521f | ||
|
|
4a7670f2aa | ||
|
|
9ba86bc174 | ||
|
|
2ebe5e051c | ||
|
|
24e98abd15 | ||
|
|
b7f1a94ba4 | ||
|
|
70bc7465c9 | ||
|
|
65c2568427 | ||
|
|
186e7bf402 | ||
|
|
e6f1c7d0c3 | ||
|
|
87ad9a3190 | ||
|
|
0ed45f8754 | ||
|
|
116e4401c4 | ||
|
|
c3c0e643d2 | ||
|
|
d5522e7c08 | ||
|
|
658b14ba26 | ||
|
|
38f8469d0b |
9
.env-template
Normal file
@@ -0,0 +1,9 @@
|
||||
OPENAI_API_KEY=<LLM api key (for example, open ai key)>
|
||||
SELF_HOSTED_MODEL=false
|
||||
VITE_API_STREAMING=true
|
||||
|
||||
#For Azure
|
||||
OPENAI_API_BASE=
|
||||
OPENAI_API_VERSION=
|
||||
AZURE_DEPLOYMENT_NAME=
|
||||
AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
|
||||
11
.github/workflows/ci.yml
vendored
@@ -8,7 +8,12 @@ on:
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
if: github.repository == 'arc53/DocsGPT'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -23,17 +28,17 @@ jobs:
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GHCR_TOKEN }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Runs a single command using the runners shell
|
||||
- name: Build and push Docker images to docker.io and ghcr.io
|
||||
uses: docker/build-push-action@v2
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
file: './application/Dockerfile'
|
||||
platforms: linux/amd64
|
||||
|
||||
10
.github/workflows/cife.yml
vendored
@@ -9,6 +9,10 @@ on:
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -23,17 +27,17 @@ jobs:
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GHCR_TOKEN }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Runs a single command using the runners shell
|
||||
- name: Build and push Docker images to docker.io and ghcr.io
|
||||
uses: docker/build-push-action@v2
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
file: './frontend/Dockerfile'
|
||||
platforms: linux/amd64
|
||||
|
||||
17
.github/workflows/lint.yml
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
name: Python linting
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- '*'
|
||||
pull_request:
|
||||
types: [ opened, synchronize ]
|
||||
|
||||
jobs:
|
||||
ruff:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Lint with Ruff
|
||||
uses: chartboost/ruff-action@v1
|
||||
30
.github/workflows/pytest.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Run python tests with pytest
|
||||
on: [push, pull_request]
|
||||
jobs:
|
||||
pytest_and_coverage:
|
||||
name: Run tests and count coverage
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9", "3.10", "3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install pytest pytest-cov
|
||||
cd application
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
- name: Test with pytest and generate coverage report
|
||||
run: |
|
||||
python -m pytest --cov=application --cov=scripts --cov=extensions --cov-report=xml
|
||||
- name: Upload coverage reports to Codecov
|
||||
if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
|
||||
uses: codecov/codecov-action@v3
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
41
.github/workflows/sync_fork.yaml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Upstream Sync
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # every day at 00:00 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
sync_latest_from_upstream:
|
||||
name: Sync latest commits from upstream repo
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event.repository.fork }}
|
||||
|
||||
steps:
|
||||
# Step 1: run a standard checkout action
|
||||
- name: Checkout target repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
# Step 2: run the sync action
|
||||
- name: Sync upstream changes
|
||||
id: sync
|
||||
uses: aormsby/Fork-Sync-With-Upstream-action@v3.4
|
||||
with:
|
||||
# set your upstream repo and branch
|
||||
upstream_sync_repo: arc53/DocsGPT
|
||||
upstream_sync_branch: main
|
||||
target_sync_branch: main
|
||||
target_repo_token: ${{ secrets.GITHUB_TOKEN }} # automatically generated, no need to set
|
||||
|
||||
# Set test_mode true to run tests instead of the true action!!
|
||||
test_mode: false
|
||||
|
||||
- name: Sync check
|
||||
if: failure()
|
||||
run: |
|
||||
echo "::error::由于权限不足,导致同步失败(这是预期的行为),请前往仓库首页手动执行[Sync fork]。"
|
||||
echo "::error::Due to insufficient permissions, synchronization failed (as expected). Please go to the repository homepage and manually perform [Sync fork]."
|
||||
exit 1
|
||||
6
.gitignore
vendored
@@ -5,7 +5,7 @@ __pycache__/
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
*.next
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
@@ -169,4 +169,6 @@ application/vectors/
|
||||
|
||||
**/yarn.lock
|
||||
|
||||
node_modules/
|
||||
node_modules/
|
||||
.vscode/settings.json
|
||||
models/
|
||||
|
||||
2
.ruff.toml
Normal file
@@ -0,0 +1,2 @@
|
||||
# Allow lines to be as long as 120 characters.
|
||||
line-length = 120
|
||||
@@ -6,33 +6,39 @@ Thank you for choosing this project to contribute to, we are all very grateful!
|
||||
|
||||
📣 Discussions - where you can start a new topic or answer some questions
|
||||
|
||||
🐞 Issues - Is how we track tasks, sometimes its bugs that need fixing, sometimes its new features
|
||||
🐞 Issues - This is how we track tasks, sometimes it is bugs that need fixing, and sometimes it is new features
|
||||
|
||||
🛠️ Pull requests - Is how you can suggest changes to our repository, to work on existing issue or to add new features
|
||||
🛠️ Pull requests - This is how you can suggest changes to our repository, to work on existing issues or add new features
|
||||
|
||||
📚 Wiki - where we have our documentation
|
||||
|
||||
|
||||
## 🐞 Issues and Pull requests
|
||||
|
||||
We value contributions to our issues in form of discussion or suggestion, we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2)
|
||||
We value contributions to our issues in the form of discussions or suggestions. We recommend that you check out the existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2).
|
||||
|
||||
If you want to contribute by writing code there are few things that you should know before doing it:
|
||||
If you want to contribute by writing code there are a few things that you should know before doing it:
|
||||
We have frontend (React, Vite) and Backend (python)
|
||||
|
||||
### If you are looking to contribute to Frontend (⚛️React, Vite):
|
||||
Current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new on. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues also [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1)
|
||||
Please try to follow guidelines
|
||||
|
||||
The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, as well as the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) design.
|
||||
Please try to follow the guidelines.
|
||||
|
||||
### If you are looking to contribute to Backend (🐍Python):
|
||||
Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon)
|
||||
Currently we don't have any tests(which would be useful😉) but before submitting you PR make sure that after you ingested some test data its queryable
|
||||
* Check out our issues, and contribute to /application or /scripts (ignore old ingest_rst.py ingest_rst_sphinx.py files, they will be deprecated soon)
|
||||
* All new code should be covered with unit tests ([pytest](https://github.com/pytest-dev/pytest)). Please find tests under [/tests](https://github.com/arc53/DocsGPT/tree/main/tests) folder.
|
||||
* Before submitting your PR make sure that after you ingested some test data it is queryable.
|
||||
|
||||
### Testing
|
||||
To run unit tests, from the root of the repository execute:
|
||||
```
|
||||
python -m pytest
|
||||
```
|
||||
|
||||
### Workflow:
|
||||
Create a fork, make changes on your forked repository, submit changes in a form of pull request
|
||||
Create a fork, make changes on your forked repository, and submit changes in the form of a pull request.
|
||||
|
||||
## Questions / collaboration
|
||||
## Questions/collaboration
|
||||
Please join our [Discord](https://discord.gg/n5BX8dh8rU). Don't hesitate; we are very friendly and welcoming to new contributors.
|
||||
|
||||
# Thank you so much for considering to contribute to DocsGPT!🙏
|
||||
# Thank you so much for considering contributing to DocsGPT!🙏
|
||||
|
||||
31
HACKTOBERFEST.md
Normal file
@@ -0,0 +1,31 @@
|
||||
🎉 Join the Hacktoberfest with DocsGPT and Earn a Free T-shirt! 🎉
|
||||
|
||||
Welcome, contributors! We're excited to announce that DocsGPT is participating in Hacktoberfest. Get involved by submitting a **meaningful** pull request, and earn a free shirt in return!
|
||||
📜 Here's How to Contribute:
|
||||
|
||||
🛠️ Code: This is the golden ticket! Make meaningful contributions through PRs.
|
||||
📚 Wiki: Improve our documentation, create a guide, or change existing documentation.
|
||||
🖥️ Design: Improve the UI/UX, or design a new feature.
|
||||
|
||||
📝 Guidelines for Pull Requests:
|
||||
|
||||
Familiarize yourself with the current contributions and our [Roadmap](https://github.com/orgs/arc53/projects/2).
|
||||
|
||||
Deciding to contribute with code? Here are some insights based on the area of your interest:
|
||||
|
||||
Frontend (⚛️React, Vite):
|
||||
Most of the code is located in the /frontend folder. You can also check out our React extension in /extensions/react-widget.
|
||||
For design references, here's the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1).
|
||||
Ensure you adhere to the established guidelines.
|
||||
|
||||
Backend (🐍Python):
|
||||
Focus on /application or /scripts. However, avoid the files ingest_rst.py and ingest_rst_sphinx.py as they are soon to be deprecated.
|
||||
Newly added code should come with relevant unit tests (pytest).
|
||||
Refer to the /tests folder for test suites.
|
||||
|
||||
Check out [Contributing Guidelines](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
|
||||
|
||||
|
||||
Don't be shy! Hop into our [Discord](https://discord.gg/n5BX8dh8rU) Server. We're a friendly bunch and eager to assist newcomers.
|
||||
|
||||
Big thanks for considering contributing to DocsGPT during Hacktoberfest! 🙏 Your effort can earn you a swanky new t-shirt. 🎁 Let's code together! 🚀
|
||||
135
README.md
@@ -18,82 +18,139 @@ Say goodbye to time-consuming manual searches, and let <strong>DocsGPT</strong>
|
||||
<a href="https://discord.gg/n5BX8dh8rU"></a>
|
||||
<a href="https://discord.gg/n5BX8dh8rU"></a>
|
||||
<a href="https://discord.gg/n5BX8dh8rU"></a>
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
### Enterprise Solutions:
|
||||
|
||||
When deploying your DocsGPT to a live environment, we're eager to provide personalized assistance. Reach out to us via email [here]( mailto:contact@arc53.com?subject=DocsGPT%20Enterprise&body=Hi%20we%20are%20%3CCompany%20name%3E%20and%20we%20want%20to%20build%20%3CSolution%3E%20with%20DocsGPT) to discuss your project further, and our team will connect with you shortly.
|
||||
|
||||
### [🎉 Join the Hacktoberfest with DocsGPT and Earn a Free T-shirt! 🎉](https://github.com/arc53/DocsGPT/blob/main/HACKTOBERFEST.md)
|
||||
|
||||

|
||||
|
||||
|
||||
## Roadmap
|
||||
|
||||
You can find our [Roadmap](https://github.com/orgs/arc53/projects/2) here. Please don't hesitate to contribute or create issues, it helps us make DocsGPT better!
|
||||
|
||||
## Our open source models optimised for DocsGPT:
|
||||
|
||||
| Name | Base Model | Requirements (or similar) |
|
||||
|-------------------|------------|----------------------------------------------------------|
|
||||
| [Docsgpt-7b-falcon](https://huggingface.co/Arc53/docsgpt-7b-falcon) | Falcon-7b | 1xA10G gpu |
|
||||
| [Docsgpt-14b](https://huggingface.co/Arc53/docsgpt-14b) | llama-2-14b | 2xA10 gpu's |
|
||||
| [Docsgpt-40b-falcon](https://huggingface.co/Arc53/docsgpt-40b-falcon) | falcon-40b | 8xA10G gpu's |
|
||||
|
||||
|
||||
If you don't have enough resources to run a model, you can use bitsandbytes to quantize it.
|
||||
|
||||
|
||||
## Features
|
||||
|
||||

|
||||
|
||||
|
||||
## Useful links
|
||||
[Live preview](https://docsgpt.arc53.com/)
|
||||
|
||||
[Join Our Discord](https://discord.gg/n5BX8dh8rU)
|
||||
|
||||
[Guides](https://docs.docsgpt.co.uk/)
|
||||
|
||||
## Roadmap
|
||||
[Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
|
||||
|
||||
You can find our [Roadmap](https://github.com/orgs/arc53/projects/2) here, please don't hesitate contributing or creating issues, it helps us make DocsGPT better!
|
||||
[How to use any other documentation](https://docs.docsgpt.co.uk/Guides/How-to-train-on-other-documentation)
|
||||
|
||||
|
||||
|
||||
## [Live preview](https://docsgpt.arc53.com/)
|
||||
|
||||
## [Join Our Discord](https://discord.gg/n5BX8dh8rU)
|
||||
[How to host it locally (so all data will stay on-premises)](https://docs.docsgpt.co.uk/Guides/How-to-use-different-LLM)
|
||||
|
||||
|
||||
## Project structure
|
||||
- Application - flask app (main application)
|
||||
- Application - Flask app (main application)
|
||||
|
||||
- Extensions - chrome extension
|
||||
- Extensions - Chrome extension
|
||||
|
||||
- Scripts - script that creates similarity search index and store for other libraries.
|
||||
- Scripts - Script that creates similarity search index and store for other libraries.
|
||||
|
||||
- frontend - frontend in vite and
|
||||
- Frontend - Frontend uses Vite and React
|
||||
|
||||
## QuickStart
|
||||
|
||||
Note: Make sure you have docker installed
|
||||
Note: Make sure you have Docker installed
|
||||
|
||||
1. Open dowload this repository with `git clone https://github.com/arc53/DocsGPT.git`
|
||||
2. Open docker-compose.yaml and replace <your_api_key> with your OpenAI's key (there are 4 places)
|
||||
3. Run `docker-compose build && docker-compose up`
|
||||
On Mac OS or Linux just write:
|
||||
|
||||
`./setup.sh`
|
||||
|
||||
It will install all the dependencies and give you the option to download a local model or use OpenAI.
|
||||
|
||||
Otherwise refer to this Guide:
|
||||
|
||||
1. Download and open this repository with `git clone https://github.com/arc53/DocsGPT.git`
|
||||
2. Create a .env file in your root directory and set the env variable OPENAI_API_KEY with your OpenAI API key and VITE_API_STREAMING to true or false, depending on if you want streaming answers or not
|
||||
It should look like this inside:
|
||||
|
||||
```
|
||||
OPENAI_API_KEY=Yourkey
|
||||
VITE_API_STREAMING=true
|
||||
SELF_HOSTED_MODEL=false
|
||||
```
|
||||
See optional environment variables in the `/.env-template` and `/application/.env_sample` files.
|
||||
3. Run `./run-with-docker-compose.sh`
|
||||
4. Navigate to http://localhost:5173/
|
||||
|
||||
To stop just run Ctrl + C
|
||||
|
||||
## Development environments
|
||||
|
||||
Spin up only 2 containers from docker-compose.yaml (by deleting all services except for redis and mongo)
|
||||
### Spin up mongo and redis
|
||||
For development only 2 containers are used from docker-compose.yaml (by deleting all services except for Redis and Mongo).
|
||||
See file [docker-compose-dev.yaml](./docker-compose-dev.yaml).
|
||||
|
||||
Make sure you have python 3.10 or 3.11 installed
|
||||
Run
|
||||
```
|
||||
docker compose -f docker-compose-dev.yaml build
|
||||
docker compose -f docker-compose-dev.yaml up -d
|
||||
```
|
||||
|
||||
1. Navigate to `/application` folder
|
||||
2. Install dependencies
|
||||
`pip install -r requirements.txt`
|
||||
3. Prepare .env file
|
||||
Copy .env_sample and create .env with your openai api token
|
||||
4. Run the app
|
||||
`python app.py`
|
||||
5. Start worker with `celery -A app.celery worker -l INFO`
|
||||
### Run the backend
|
||||
|
||||
Make sure you have Python 3.10 or 3.11 installed.
|
||||
|
||||
1. Export required environment variables
|
||||
```commandline
|
||||
export CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
export CELERY_RESULT_BACKEND=redis://localhost:6379/1
|
||||
export MONGO_URI=mongodb://localhost:27017/docsgpt
|
||||
export FLASK_APP=application/app.py
|
||||
export FLASK_DEBUG=true
|
||||
```
|
||||
2. Prepare .env file
|
||||
Copy `.env_sample` and create `.env` with your OpenAI API token
|
||||
3. (optional) Create a Python virtual environment
|
||||
```commandline
|
||||
python -m venv venv
|
||||
. venv/bin/activate
|
||||
```
|
||||
4. Install dependencies for the backend (run from the repository root)
|
||||
```commandline
|
||||
pip install -r application/requirements.txt
|
||||
```
|
||||
5. Run the app `flask run --host=0.0.0.0 --port=7091`
|
||||
6. Start worker with `celery -A application.app.celery worker -l INFO`
|
||||
|
||||
### Start frontend
|
||||
Make sure you have Node version 16 or higher.
|
||||
|
||||
To start frontend
|
||||
1. Navigate to `/frontend` folder
|
||||
2. Install dependencies
|
||||
`npm install`
|
||||
3. In the file `.env.development` instead of `VITE_API_HOST = https://docsapi.arc53.com` use `VITE_API_HOST=http://localhost:5001`
|
||||
3. Run the app
|
||||
4. `npm run dev`
|
||||
3. Run the app
|
||||
`npm run dev`
|
||||
|
||||
|
||||
[How to install the Chrome extension](https://github.com/arc53/docsgpt/wiki#launch-chrome-extension)
|
||||
|
||||
|
||||
## [Guides](https://github.com/arc53/docsgpt/wiki)
|
||||
|
||||
## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
|
||||
|
||||
## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation)
|
||||
|
||||
## [How to host it locally (so all data will stay on-premises)](https://github.com/arc53/DocsGPT/wiki/How-to-use-different-LLM's#hosting-everything-locally)
|
||||
|
||||
Built with [🦜️🔗 LangChain](https://github.com/hwchase17/langchain)
|
||||
|
||||
|
||||
@@ -3,4 +3,10 @@ EMBEDDINGS_KEY=your_api_key
|
||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
CELERY_RESULT_BACKEND=redis://localhost:6379/1
|
||||
MONGO_URI=mongodb://localhost:27017/docsgpt
|
||||
API_URL=http://localhost:5001
|
||||
API_URL=http://localhost:7091
|
||||
|
||||
#For OPENAI on Azure
|
||||
OPENAI_API_BASE=
|
||||
OPENAI_API_VERSION=
|
||||
AZURE_DEPLOYMENT_NAME=
|
||||
AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
|
||||
@@ -4,22 +4,20 @@ FROM python:3.10-slim-bullseye as builder
|
||||
RUN apt-get update && apt-get install -y gcc curl
|
||||
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
RUN pip install --upgrade pip && pip install tiktoken==0.1.2
|
||||
RUN pip install --upgrade pip && pip install tiktoken==0.3.3
|
||||
COPY requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
|
||||
FROM python:3.10-slim-bullseye
|
||||
# Copy pre-built packages from builder stage
|
||||
COPY --from=builder /usr/local/lib/python3.10/site-packages/ /usr/local/lib/python3.10/site-packages/
|
||||
RUN pip install gunicorn==20.1.0
|
||||
RUN pip install celery==5.2.7
|
||||
|
||||
# Copy pre-built packages and binaries from builder stage
|
||||
COPY --from=builder /usr/local/ /usr/local/
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
COPY . /app/application
|
||||
ENV FLASK_APP=app.py
|
||||
ENV FLASK_DEBUG=true
|
||||
|
||||
EXPOSE 7091
|
||||
|
||||
EXPOSE 5001
|
||||
|
||||
CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:5001", "wsgi:app"]
|
||||
CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"]
|
||||
|
||||
0
application/__init__.py
Normal file
0
application/api/__init__.py
Normal file
0
application/api/answer/__init__.py
Normal file
337
application/api/answer/routes.py
Normal file
@@ -0,0 +1,337 @@
|
||||
import asyncio
|
||||
import os
|
||||
from flask import Blueprint, request, Response
|
||||
import json
|
||||
import datetime
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from pymongo import MongoClient
|
||||
from bson.objectid import ObjectId
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
from application.llm.llm_creator import LLMCreator
|
||||
from application.error import bad_request
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)

# MongoDB handles used by this blueprint.
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
vectors_collection = db["vectors"]
answer = Blueprint('answer', __name__)

# Model name passed to the LLM when generating completions.
if settings.LLM_NAME == "gpt4":
    gpt_model = 'gpt-4'
else:
    gpt_model = 'gpt-3.5-turbo'

# load the prompts
# current_dir is three directory levels above this file.
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def _load_prompt(filename):
    """Return the text of ``<current_dir>/prompts/<filename>``."""
    with open(os.path.join(current_dir, "prompts", filename), "r") as f:
        return f.read()


template = _load_prompt("combine_prompt.txt")
template_hist = _load_prompt("combine_prompt_hist.txt")
template_quest = _load_prompt("question_prompt.txt")
chat_combine_template = _load_prompt("chat_combine_prompt.txt")
chat_reduce_template = _load_prompt("chat_reduce_prompt.txt")

# Whether the keys were provided through settings.
api_key_set = settings.API_KEY is not None
embeddings_key_set = settings.EMBEDDINGS_KEY is not None
|
||||
|
||||
|
||||
async def async_generate(chain, question, chat_history):
    """Await ``chain.arun`` for one question and return its result.

    Args:
        chain: object exposing an awaitable ``arun(payload)`` method.
        question: the question forwarded under the ``"question"`` key.
        chat_history: prior exchanges forwarded under the ``"chat_history"`` key.

    Returns:
        Whatever ``chain.arun`` resolves to.
    """
    return await chain.arun({"question": question, "chat_history": chat_history})
|
||||
|
||||
|
||||
def count_tokens(string):
    """Return the number of GPT-2 tokens in ``string``.

    The tokenizer is loaded on the first call and cached on the function
    object, so later calls do not repeat ``from_pretrained``.

    Args:
        string: text to tokenize.

    Returns:
        Length of the ``input_ids`` list produced by the tokenizer.
    """
    tokenizer = getattr(count_tokens, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
        count_tokens._tokenizer = tokenizer
    return len(tokenizer(string)['input_ids'])
|
||||
|
||||
|
||||
def run_async_chain(chain, question, chat_history):
    """Run ``async_generate`` to completion from synchronous code.

    A fresh event loop is created for the call. It is always closed
    afterwards and deregistered as the current loop, so no closed loop is
    left behind for later code in the same thread.

    Args:
        chain: object exposing an awaitable ``arun(payload)`` method.
        question: the question to ask.
        chat_history: prior exchanges passed through to the chain.

    Returns:
        ``{"answer": <result of the chain>}``.

    Raises:
        Any exception raised while running the chain propagates unchanged.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        answer = loop.run_until_complete(async_generate(chain, question, chat_history))
    finally:
        # Same cleanup order as asyncio.run: deregister, then close.
        asyncio.set_event_loop(None)
        loop.close()
    return {"answer": answer}
|
||||
|
||||
|
||||
def get_vectorstore(data):
    """Map the request's ``active_docs`` value to a vector store path.

    Resolution rules:
      * key missing, ``"default"`` or ``"local/default[/...]"`` -> ``application/``
      * ``"local/<...>"``  -> ``application/indexes/local/<...>``
      * anything else      -> ``application/vectors/<active_docs>``

    A bare ``"local"`` (no second path segment) is treated like any other
    local index instead of raising ``IndexError``.

    Args:
        data: mapping that may contain an ``"active_docs"`` string.

    Returns:
        The path string joined onto ``"application"``.
    """
    if "active_docs" not in data or data["active_docs"] == "default":
        vectorstore = ""
    else:
        active_docs = data["active_docs"]
        segments = active_docs.split("/")
        if segments[0] != "local":
            vectorstore = "vectors/" + active_docs
        elif len(segments) > 1 and segments[1] == "default":
            vectorstore = ""
        else:
            vectorstore = "indexes/" + active_docs
    return os.path.join("application", vectorstore)
|
||||
|
||||
|
||||
# def get_docsearch(vectorstore, embeddings_key):
|
||||
# if settings.EMBEDDINGS_NAME == "openai_text-embedding-ada-002":
|
||||
# if is_azure_configured():
|
||||
# os.environ["OPENAI_API_TYPE"] = "azure"
|
||||
# openai_embeddings = OpenAIEmbeddings(model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME)
|
||||
# else:
|
||||
# openai_embeddings = OpenAIEmbeddings(openai_api_key=embeddings_key)
|
||||
# docsearch = FAISS.load_local(vectorstore, openai_embeddings)
|
||||
# elif settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
|
||||
# docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
|
||||
# elif settings.EMBEDDINGS_NAME == "huggingface_hkunlp/instructor-large":
|
||||
# docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
|
||||
# elif settings.EMBEDDINGS_NAME == "cohere_medium":
|
||||
# docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
|
||||
# return docsearch
|
||||
|
||||
|
||||
def is_azure_configured():
    """Return True when all three Azure settings are set to truthy values.

    Checks ``OPENAI_API_BASE``, ``OPENAI_API_VERSION`` and
    ``AZURE_DEPLOYMENT_NAME`` on ``settings``. The result is coerced to a
    real ``bool`` rather than leaking the last operand's value.
    """
    return bool(settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME)
|
||||
|
||||
|
||||
def complete_stream(question, docsearch, chat_history, api_key, conversation_id):
    """Stream an answer as server-sent events and store the exchange.

    Yields one ``source`` event per retrieved document, one ``answer`` event
    per chunk returned by ``llm.gen_stream``, then an ``id`` event with the
    conversation id and a final ``end`` event.

    Args:
        question: The user's question.
        docsearch: Object exposing ``search(question, k=...)``.
        chat_history: Sequence of dicts; entries with both ``"prompt"`` and
            ``"response"`` keys are added to the prompt. It is not modified.
        api_key: Key passed to ``LLMCreator.create_llm``.
        conversation_id: Id of the conversation to append to, or ``None`` to
            create a new conversation named by an LLM-generated summary.
    """
    llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)

    docs = docsearch.search(question, k=2)
    if settings.LLM_NAME == "llama.cpp":
        # Keep only the top hit; slicing (unlike docs[0]) tolerates an empty
        # search result.
        docs = docs[:1]
    # join all page_content together with a newline
    docs_together = "\n".join([doc.page_content for doc in docs])
    p_chat_combine = chat_combine_template.replace("{summaries}", docs_together)
    messages_combine = [{"role": "system", "content": p_chat_combine}]
    source_log_docs = []
    for doc in docs:
        if doc.metadata:
            data = json.dumps({"type": "source", "doc": doc.page_content, "metadata": doc.metadata})
            # Metadata without a "title" falls back to the text, like the
            # no-metadata branch, instead of raising KeyError mid-stream.
            if "title" in doc.metadata:
                title = doc.metadata["title"].split("/")[-1]
            else:
                title = doc.page_content
            source_log_docs.append({"title": title, "text": doc.page_content})
        else:
            data = json.dumps({"type": "source", "doc": doc.page_content})
            source_log_docs.append({"title": doc.page_content, "text": doc.page_content})
        yield f"data:{data}\n\n"

    if len(chat_history) > 1:
        tokens_current_history = 0
        # Walk the history newest-first so the most recent turns win the token
        # budget; reversed() leaves the caller's list untouched.
        # NOTE(review): turns are therefore appended newest-first — confirm
        # that the model is meant to see them in that order.
        for turn in reversed(chat_history):
            if "prompt" in turn and "response" in turn:
                tokens_batch = count_tokens(turn["prompt"]) + count_tokens(turn["response"])
                if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
                    tokens_current_history += tokens_batch
                    messages_combine.append({"role": "user", "content": turn["prompt"]})
                    messages_combine.append({"role": "system", "content": turn["response"]})
    messages_combine.append({"role": "user", "content": question})

    response_full = ""
    completion = llm.gen_stream(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
                                messages=messages_combine)
    for line in completion:
        data = json.dumps({"answer": str(line)})
        response_full += str(line)
        yield f"data: {data}\n\n"

    # save conversation to database
    if conversation_id is not None:
        conversations_collection.update_one(
            {"_id": ObjectId(conversation_id)},
            {"$push": {"queries": {"prompt": question, "response": response_full, "sources": source_log_docs}}},
        )
    else:
        # create new conversation, named by a short generated summary
        messages_summary = [{"role": "assistant", "content": "Summarise following conversation in no more than 3 "
                                                              "words, respond ONLY with the summary, use the same "
                                                              "language as the system \n\nUser: " + question + "\n\n" +
                                                              "AI: " +
                                                              response_full},
                            {"role": "user", "content": "Summarise following conversation in no more than 3 words, "
                                                        "respond ONLY with the summary, use the same language as the "
                                                        "system"}]

        completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
                             messages=messages_summary, max_tokens=30)
        conversation_id = conversations_collection.insert_one(
            {"user": "local",
             "date": datetime.datetime.utcnow(),
             "name": completion,
             "queries": [{"prompt": question, "response": response_full, "sources": source_log_docs}]}
        ).inserted_id

    # tell the client which conversation this belongs to, then signal the end
    data = json.dumps({"type": "id", "id": str(conversation_id)})
    yield f"data: {data}\n\n"
    data = json.dumps({"type": "end"})
    yield f"data: {data}\n\n"
|
||||
|
||||
|
||||
@answer.route("/stream", methods=["POST"])
def stream():
    """Answer a question as a ``text/event-stream`` response.

    Reads ``question`` and a JSON-encoded ``history`` string from the JSON
    body, plus optional ``conversation_id`` and ``active_docs``. ``api_key``
    and ``embeddings_key`` are read from the body only when the
    corresponding ``*_set`` flag is false.
    """
    data = request.get_json()
    question = data["question"]
    # history arrives as a JSON string and is decoded here
    history = json.loads(data["history"])
    # Optional, matching /api/answer: None makes complete_stream create a new
    # conversation instead of this route failing with KeyError.
    conversation_id = data.get("conversation_id")

    api_key = settings.API_KEY if api_key_set else data["api_key"]
    embeddings_key = settings.EMBEDDINGS_KEY if embeddings_key_set else data["embeddings_key"]

    if "active_docs" in data:
        vectorstore = get_vectorstore({"active_docs": data["active_docs"]})
    else:
        # NOTE(review): get_vectorstore({}) would give "application/" here,
        # while this branch uses "" — confirm which default is intended.
        vectorstore = ""
    docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)

    return Response(
        complete_stream(question, docsearch,
                        chat_history=history, api_key=api_key,
                        conversation_id=conversation_id), mimetype="text/event-stream"
    )
|
||||
|
||||
|
||||
@answer.route("/api/answer", methods=["POST"])
def api_answer():
    """Answer a question in a single response and store the exchange.

    Reads ``question`` and ``history`` from the JSON body, plus optional
    ``conversation_id`` and ``active_docs``. ``api_key`` and
    ``embeddings_key`` are read from the body only when the corresponding
    ``*_set`` flag is false.

    Returns:
        A dict with ``answer``, ``sources`` and ``conversation_id``. Any
        exception raised while answering is turned into
        ``bad_request(500, str(e))``.
    """
    data = request.get_json()
    question = data["question"]
    history = data["history"]
    # A missing conversation_id means a new conversation is created below.
    conversation_id = data.get("conversation_id")
    print("-" * 5)
    api_key = settings.API_KEY if api_key_set else data["api_key"]
    embeddings_key = settings.EMBEDDINGS_KEY if embeddings_key_set else data["embeddings_key"]

    # use try and except to check for exception
    try:
        # /stream takes the history JSON-encoded; accept that form here as
        # well rather than failing on str.reverse() further down.
        if isinstance(history, str):
            history = json.loads(history)

        # check if the vectorstore is set
        vectorstore = get_vectorstore(data)
        # loading the index and the store and the prompt template
        # Note if you have used other embeddings than OpenAI, you need to change the embeddings
        docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)

        llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)

        docs = docsearch.search(question, k=2)
        # join all page_content together with a newline
        docs_together = "\n".join([doc.page_content for doc in docs])
        p_chat_combine = chat_combine_template.replace("{summaries}", docs_together)
        messages_combine = [{"role": "system", "content": p_chat_combine}]
        source_log_docs = []
        for doc in docs:
            # Use the last path component of the title when there is one;
            # otherwise fall back to the text (also when metadata lacks "title").
            if doc.metadata and "title" in doc.metadata:
                title = doc.metadata["title"].split("/")[-1]
            else:
                title = doc.page_content
            source_log_docs.append({"title": title, "text": doc.page_content})

        if len(history) > 1:
            tokens_current_history = 0
            # Newest turns first so they win the token budget; reversed()
            # avoids mutating the request payload.
            for turn in reversed(history):
                if "prompt" in turn and "response" in turn:
                    tokens_batch = count_tokens(turn["prompt"]) + count_tokens(turn["response"])
                    if tokens_current_history + tokens_batch < settings.TOKENS_MAX_HISTORY:
                        tokens_current_history += tokens_batch
                        messages_combine.append({"role": "user", "content": turn["prompt"]})
                        messages_combine.append({"role": "system", "content": turn["response"]})
        messages_combine.append({"role": "user", "content": question})

        completion = llm.gen(model=gpt_model, engine=settings.AZURE_DEPLOYMENT_NAME,
                             messages=messages_combine)

        result = {"answer": completion, "sources": source_log_docs}
        logger.debug(result)

        # append to the given conversation, or create one
        if conversation_id is not None:
            conversations_collection.update_one(
                {"_id": ObjectId(conversation_id)},
                {"$push": {"queries": {"prompt": question,
                                       "response": result["answer"], "sources": result['sources']}}},
            )
        else:
            # create new conversation, named by a short generated summary
            messages_summary = [
                {"role": "assistant", "content": "Summarise following conversation in no more than 3 words, "
                                                 "respond ONLY with the summary, use the same language as the system \n\n"
                                                 "User: " + question + "\n\n" + "AI: " + result["answer"]},
                {"role": "user", "content": "Summarise following conversation in no more than 3 words, "
                                            "respond ONLY with the summary, use the same language as the system"}
            ]

            completion = llm.gen(
                model=gpt_model,
                engine=settings.AZURE_DEPLOYMENT_NAME,
                messages=messages_summary,
                max_tokens=30
            )
            conversation_id = conversations_collection.insert_one(
                {"user": "local",
                 "date": datetime.datetime.utcnow(),
                 "name": completion,
                 "queries": [{"prompt": question, "response": result["answer"], "sources": source_log_docs}]}
            ).inserted_id

        result["conversation_id"] = str(conversation_id)
        return result
    except Exception as e:
        # print whole traceback
        traceback.print_exc()
        print(str(e))
        return bad_request(500, str(e))
|
||||
0
application/api/internal/__init__.py
Normal file
69
application/api/internal/routes.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import datetime
|
||||
from flask import Blueprint, request, send_from_directory
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
|
||||
from application.core.settings import settings
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
vectors_collection = db["vectors"]
|
||||
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
internal = Blueprint('internal', __name__)
|
||||
@internal.route("/api/download", methods=["get"])
def download_file():
    """Send an uploaded file back as an attachment.

    The ``user``, ``name`` and ``file`` query parameters are each passed
    through ``secure_filename`` and select
    ``<current_dir>/<UPLOAD_FOLDER>/<user>/<name>/<file>``.

    Returns:
        The file response, or ``{"status": "missing parameter"}`` with
        HTTP 400 when any of the three parameters is absent.
    """
    raw_parts = [request.args.get(key) for key in ("user", "name", "file")]
    if any(part is None for part in raw_parts):
        # secure_filename expects a string, so reject absent parameters here.
        return {"status": "missing parameter"}, 400
    user, job_name, filename = (secure_filename(part) for part in raw_parts)
    save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
    return send_from_directory(save_dir, filename, as_attachment=True)
|
||||
|
||||
|
||||
|
||||
@internal.route("/api/upload_index", methods=["POST"])
def upload_index_files():
    """Upload two files(index.faiss, index.pkl) to the user's folder.

    Requires the form fields ``user`` and ``name``. When
    ``settings.VECTOR_STORE`` is ``"faiss"`` it also requires the files
    ``file_faiss`` and ``file_pkl``, which are saved under
    ``<current_dir>/indexes/<user>/<name>/``. An entry describing the index
    is then inserted into ``vectors_collection``.

    Returns:
        ``{"status": "ok"}`` on success, otherwise a dict whose ``status``
        names the missing input.
    """
    if "user" not in request.form:
        return {"status": "no user"}
    user = secure_filename(request.form["user"])
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = secure_filename(request.form["name"])
    save_dir = os.path.join(current_dir, "indexes", user, job_name)
    if settings.VECTOR_STORE == "faiss":
        if "file_faiss" not in request.files:
            print("No file part")
            return {"status": "no file"}
        file_faiss = request.files["file_faiss"]
        if file_faiss.filename == "":
            return {"status": "no file name"}
        if "file_pkl" not in request.files:
            print("No file part")
            return {"status": "no file"}
        file_pkl = request.files["file_pkl"]
        if file_pkl.filename == "":
            return {"status": "no file name"}
        # saves index files; exist_ok avoids failing when a concurrent upload
        # creates the directory between a check and the makedirs call
        os.makedirs(save_dir, exist_ok=True)
        file_faiss.save(os.path.join(save_dir, "index.faiss"))
        file_pkl.save(os.path.join(save_dir, "index.pkl"))
    # create entry in vectors_collection
    vectors_collection.insert_one(
        {
            "user": user,
            "name": job_name,
            "language": job_name,
            "location": save_dir,
            "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "model": settings.EMBEDDINGS_NAME,
            "type": "local",
        }
    )
    return {"status": "ok"}
|
||||
0
application/api/user/__init__.py
Normal file
226
application/api/user/routes.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import os
|
||||
from flask import Blueprint, request, jsonify
|
||||
import requests
|
||||
import json
|
||||
from pymongo import MongoClient
|
||||
from bson.objectid import ObjectId
|
||||
from werkzeug.utils import secure_filename
|
||||
import http.client
|
||||
|
||||
from application.api.user.tasks import ingest
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
vectors_collection = db["vectors"]
|
||||
user = Blueprint('user', __name__)
|
||||
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
@user.route("/api/delete_conversation", methods=["POST"])
def delete_conversation():
    """Remove the conversation named by the ``id`` query parameter from MongoDB."""
    selector = {"_id": ObjectId(request.args.get("id"))}
    conversations_collection.delete_one(selector)
    return {"status": "ok"}
|
||||
|
||||
@user.route("/api/get_conversations", methods=["get"])
def get_conversations():
    """Return every conversation as ``{"id", "name"}``, sorted by ``date`` descending."""
    newest_first = conversations_collection.find().sort("date", -1)
    summaries = [
        {"id": str(entry["_id"]), "name": entry["name"]}
        for entry in newest_first
    ]
    return jsonify(summaries)
|
||||
|
||||
|
||||
@user.route("/api/get_single_conversation", methods=["get"])
def get_single_conversation():
    """Return the ``queries`` list of the conversation given by the ``id`` query parameter.

    Returns:
        The JSON-encoded ``queries`` of the conversation, or
        ``{"status": "not found"}`` with HTTP 404 when no document matches.
    """
    conversation_id = request.args.get("id")
    conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)})
    if conversation is None:
        # find_one returns None for an unknown id; subscripting it would raise.
        return {"status": "not found"}, 404
    return jsonify(conversation['queries'])
|
||||
|
||||
|
||||
@user.route("/api/feedback", methods=["POST"])
def api_feedback():
    """Log a feedback entry and forward it to the feedback endpoint.

    Reads ``question``, ``answer`` and ``feedback`` from the JSON body.

    Returns:
        ``{"status": <reason phrase>}`` for the upstream HTTP status code,
        or ``"ok"`` when the code has no standard phrase.
    """
    data = request.get_json()
    question = data["question"]
    answer = data["answer"]
    feedback = data["feedback"]

    print("-" * 5)
    print("Question: " + question)
    print("Answer: " + answer)
    print("Feedback: " + feedback)
    print("-" * 5)
    response = requests.post(
        url="https://86x89umx77.execute-api.eu-west-2.amazonaws.com/docsgpt-feedback",
        headers={
            "Content-Type": "application/json; charset=utf-8",
        },
        data=json.dumps({"answer": answer, "question": question, "feedback": feedback}),
        # without a timeout an unresponsive endpoint would block this request
        # handler indefinitely
        timeout=10,
    )
    return {"status": http.client.responses.get(response.status_code, "ok")}
|
||||
|
||||
|
||||
@user.route("/api/delete_old", methods=["get"])
def delete_old():
    """Delete old indexes.

    The ``path`` query parameter must start with ``indexes`` or ``vectors``.
    The matching ``vectors_collection`` entry is deleted, then the index
    itself: the directory is removed for the ``faiss`` store, otherwise
    ``delete_index()`` is called on the configured vector store.

    Returns:
        ``{"status": "ok"}`` on success, ``{"status": "error"}`` when the
        path is missing, has another root, or names nothing below the root.
    """
    import shutil

    path = request.args.get("path")
    if not path:
        return {"status": "error"}
    dirs = path.split("/")
    # check that path starts with indexes or vectors
    if dirs[0] not in ["indexes", "vectors"]:
        return {"status": "error"}
    # Build the filesystem path from the sanitised segments only. Joining the
    # raw segments let "../" components escape the index root and reach
    # shutil.rmtree.
    dirs_clean = [secure_filename(segment) for segment in dirs[1:]]
    # secure_filename reduces unsafe segments such as ".." to "", so drop
    # empties and refuse a request that would resolve to the root itself.
    dirs_clean = [segment for segment in dirs_clean if segment]
    if not dirs_clean:
        return {"status": "error"}
    path_clean = "/".join([dirs[0]] + dirs_clean)
    vectors_collection.delete_one({"location": path})
    if settings.VECTOR_STORE == "faiss":
        try:
            shutil.rmtree(os.path.join(current_dir, path_clean))
        except FileNotFoundError:
            # already gone; deletion is best-effort
            pass
    else:
        vectorstore = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean)
        )
        vectorstore.delete_index()

    return {"status": "ok"}
|
||||
|
||||
@user.route("/api/upload", methods=["POST"])
def upload_file():
    """Upload a file to get vectorized and indexed.

    Requires the form fields ``user`` and ``name`` and the file ``file``.
    The file is saved under ``<current_dir>/<UPLOAD_FOLDER>/<user>/<name>/``
    and an ``ingest`` task is queued for it.

    Returns:
        ``{"status": "ok", "task_id": ...}`` on success, otherwise a dict
        whose ``status`` names the problem.
    """
    if "user" not in request.form:
        return {"status": "no user"}
    user = secure_filename(request.form["user"])
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = secure_filename(request.form["name"])
    # check if the post request has the file part
    if "file" not in request.files:
        print("No file part")
        return {"status": "no file"}
    file = request.files["file"]
    if file.filename == "":
        return {"status": "no file name"}

    if not file:
        return {"status": "error"}

    filename = secure_filename(file.filename)
    if not filename:
        # nothing usable is left after sanitising, so there is no file name
        # to save under
        return {"status": "no file name"}
    save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
    # exist_ok avoids failing when a concurrent upload creates the directory
    os.makedirs(save_dir, exist_ok=True)

    file.save(os.path.join(save_dir, filename))
    task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt"], job_name, filename, user)
    return {"status": "ok", "task_id": task.id}
|
||||
|
||||
@user.route("/api/task_status", methods=["GET"])
def task_status():
    """Report the status and info of the Celery task given by ``task_id``."""
    requested_id = request.args.get("task_id")
    from application.celery import celery

    async_result = celery.AsyncResult(requested_id)
    info = async_result.info
    return {"status": async_result.status, "result": info}
|
||||
|
||||
|
||||
@user.route("/api/combine", methods=["GET"])
def combined_json():
    """Provide json file with combined available indexes.

    The list starts with a built-in ``default`` entry, followed by the
    ``vectors_collection`` entries of user ``"local"``. For the ``faiss``
    store the remote ``combined.json`` catalogue is appended, with each
    entry's ``location`` set to ``"remote"``.
    """
    # named ``owner`` so the module-level ``user`` blueprint is not shadowed
    owner = "local"

    data = [
        {
            "name": "default",
            "language": "default",
            "version": "",
            "description": "default",
            "fullName": "default",
            "date": "default",
            "docLink": "default",
            "model": settings.EMBEDDINGS_NAME,
            "location": "local",
        }
    ]
    # structure: name, language, version, description, fullName, date, docLink
    # append data from vectors_collection
    for index in vectors_collection.find({"user": owner}):
        data.append(
            {
                "name": index["name"],
                "language": index["language"],
                "version": "",
                "description": index["name"],
                "fullName": index["name"],
                "date": index["date"],
                "docLink": index["location"],
                "model": settings.EMBEDDINGS_NAME,
                "location": "local",
            }
        )
    if settings.VECTOR_STORE == "faiss":
        # the timeout keeps an unresponsive CDN from blocking this handler
        data_remote = requests.get(
            "https://d3dg1063dc54p9.cloudfront.net/combined.json", timeout=10
        ).json()
        for index in data_remote:
            index["location"] = "remote"
            data.append(index)

    return jsonify(data)
|
||||
|
||||
|
||||
@user.route("/api/docs_check", methods=["POST"])
def check_docs():
    """Check that a docs index is available, downloading it when it is not.

    Reads ``docs`` from the JSON body.

    Returns:
        ``{"status": "exists"}`` for ``local/...`` and ``default`` indexes or
        when ``vectors/<docs>`` already exists, ``{"status": "loaded"}``
        after both index files were downloaded, and ``{"status": "null"}``
        when the index cannot be fetched or the name is rejected.
    """
    data = request.get_json()
    docs = data["docs"]
    # split docs on / and take first part
    segments = docs.split("/")
    if segments[0] == "local":
        return {"status": "exists"}
    if ".." in segments:
        # the name becomes a directory that is created and written to, so
        # parent-directory segments must not get through
        return {"status": "null"}
    vectorstore = "vectors/" + docs
    base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/"
    if os.path.exists(vectorstore) or docs == "default":
        return {"status": "exists"}

    # NOTE(review): the concatenations below assume ``docs`` ends with "/".
    # Fetch both files before writing anything, so a failed second download
    # cannot leave a half-populated directory that later reports "exists".
    faiss_response = requests.get(base_path + vectorstore + "index.faiss", timeout=30)
    if faiss_response.status_code != 200:
        return {"status": "null"}
    pkl_response = requests.get(base_path + vectorstore + "index.pkl", timeout=30)
    if pkl_response.status_code != 200:
        return {"status": "null"}

    os.makedirs(vectorstore, exist_ok=True)
    with open(vectorstore + "index.faiss", "wb") as f:
        f.write(faiss_response.content)
    with open(vectorstore + "index.pkl", "wb") as f:
        f.write(pkl_response.content)

    return {"status": "loaded"}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
7
application/api/user/tasks.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from application.worker import ingest_worker
|
||||
from application.celery import celery
|
||||
|
||||
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
    """Celery task that hands its arguments to ``ingest_worker`` and returns the result."""
    return ingest_worker(self, directory, formats, name_job, filename, user)
|
||||
@@ -1,55 +1,19 @@
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
import platform
|
||||
|
||||
|
||||
import dotenv
|
||||
import requests
|
||||
from celery import Celery
|
||||
from celery.result import AsyncResult
|
||||
from flask import Flask, request, render_template, send_from_directory, jsonify
|
||||
from langchain import FAISS
|
||||
from langchain import VectorDBQA, HuggingFaceHub, Cohere, OpenAI
|
||||
from langchain.chains import ChatVectorDBChain
|
||||
from langchain.chains.question_answering import load_qa_chain
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings, CohereEmbeddings, \
|
||||
HuggingFaceInstructEmbeddings
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
from application.celery import celery
|
||||
from flask import Flask, request, redirect
|
||||
|
||||
from error import bad_request
|
||||
from worker import ingest_worker
|
||||
|
||||
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
|
||||
from application.core.settings import settings
|
||||
from application.api.user.routes import user
|
||||
from application.api.answer.routes import answer
|
||||
from application.api.internal.routes import internal
|
||||
|
||||
if os.getenv("LLM_NAME") is not None:
|
||||
llm_choice = os.getenv("LLM_NAME")
|
||||
else:
|
||||
llm_choice = "openai_chat"
|
||||
|
||||
if os.getenv("EMBEDDINGS_NAME") is not None:
|
||||
embeddings_choice = os.getenv("EMBEDDINGS_NAME")
|
||||
else:
|
||||
embeddings_choice = "openai_text-embedding-ada-002"
|
||||
|
||||
if llm_choice == "manifest":
|
||||
from manifest import Manifest
|
||||
from langchain.llms.manifest import ManifestWrapper
|
||||
|
||||
manifest = Manifest(
|
||||
client_name="huggingface",
|
||||
client_connection="http://127.0.0.1:5000"
|
||||
)
|
||||
|
||||
# Redirect PosixPath to WindowsPath on Windows
|
||||
import platform
|
||||
|
||||
if platform.system() == "Windows":
|
||||
import pathlib
|
||||
@@ -60,379 +24,45 @@ if platform.system() == "Windows":
|
||||
# loading the .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
# load the prompts
|
||||
with open("prompts/combine_prompt.txt", "r") as f:
|
||||
template = f.read()
|
||||
|
||||
with open("prompts/combine_prompt_hist.txt", "r") as f:
|
||||
template_hist = f.read()
|
||||
|
||||
with open("prompts/question_prompt.txt", "r") as f:
|
||||
template_quest = f.read()
|
||||
|
||||
with open("prompts/chat_combine_prompt.txt", "r") as f:
|
||||
chat_combine_template = f.read()
|
||||
|
||||
with open("prompts/chat_reduce_prompt.txt", "r") as f:
|
||||
chat_reduce_template = f.read()
|
||||
|
||||
if os.getenv("API_KEY") is not None:
|
||||
api_key_set = True
|
||||
else:
|
||||
api_key_set = False
|
||||
if os.getenv("EMBEDDINGS_KEY") is not None:
|
||||
embeddings_key_set = True
|
||||
else:
|
||||
embeddings_key_set = False
|
||||
|
||||
app = Flask(__name__)
|
||||
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER = "inputs"
|
||||
app.config['CELERY_BROKER_URL'] = os.getenv("CELERY_BROKER_URL")
|
||||
app.config['CELERY_RESULT_BACKEND'] = os.getenv("CELERY_RESULT_BACKEND")
|
||||
app.config['MONGO_URI'] = os.getenv("MONGO_URI")
|
||||
celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'], backend=app.config['CELERY_RESULT_BACKEND'])
|
||||
celery.conf.update(app.config)
|
||||
mongo = MongoClient(app.config['MONGO_URI'])
|
||||
db = mongo["docsgpt"]
|
||||
vectors_collection = db["vectors"]
|
||||
app.register_blueprint(user)
|
||||
app.register_blueprint(answer)
|
||||
app.register_blueprint(internal)
|
||||
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER = "inputs"
|
||||
app.config["CELERY_BROKER_URL"] = settings.CELERY_BROKER_URL
|
||||
app.config["CELERY_RESULT_BACKEND"] = settings.CELERY_RESULT_BACKEND
|
||||
app.config["MONGO_URI"] = settings.MONGO_URI
|
||||
celery.config_from_object("application.celeryconfig")
|
||||
|
||||
|
||||
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
    """Celery task that hands its arguments to ``ingest_worker`` and returns the result."""
    return ingest_worker(self, directory, formats, name_job, filename, user)
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def home():
|
||||
return render_template("index.html", api_key_set=api_key_set, llm_choice=llm_choice,
|
||||
embeddings_choice=embeddings_choice)
|
||||
|
||||
|
||||
@app.route("/api/answer", methods=["POST"])
|
||||
def api_answer():
|
||||
data = request.get_json()
|
||||
question = data["question"]
|
||||
history = data["history"]
|
||||
print('-' * 5)
|
||||
if not api_key_set:
|
||||
api_key = data["api_key"]
|
||||
"""
|
||||
The frontend source code lives in the /frontend directory of the repository.
|
||||
"""
|
||||
if request.remote_addr in ('0.0.0.0', '127.0.0.1', 'localhost', '172.18.0.1'):
|
||||
# If users locally try to access DocsGPT running in Docker,
|
||||
# they will be redirected to the Frontend application.
|
||||
return redirect('http://localhost:5173')
|
||||
else:
|
||||
api_key = os.getenv("API_KEY")
|
||||
if not embeddings_key_set:
|
||||
embeddings_key = data["embeddings_key"]
|
||||
else:
|
||||
embeddings_key = os.getenv("EMBEDDINGS_KEY")
|
||||
|
||||
# use try and except to check for exception
|
||||
try:
|
||||
# check if the vectorstore is set
|
||||
if "active_docs" in data:
|
||||
if data["active_docs"].split("/")[0] == "local":
|
||||
vectorstore = "indexes/" + data["active_docs"]
|
||||
else:
|
||||
vectorstore = "vectors/" + data["active_docs"]
|
||||
if data['active_docs'] == "default":
|
||||
vectorstore = ""
|
||||
else:
|
||||
vectorstore = ""
|
||||
print(vectorstore)
|
||||
# vectorstore = "outputs/inputs/"
|
||||
# loading the index and the store and the prompt template
|
||||
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
|
||||
if embeddings_choice == "openai_text-embedding-ada-002":
|
||||
docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
|
||||
elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
|
||||
elif embeddings_choice == "huggingface_hkunlp/instructor-large":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
|
||||
elif embeddings_choice == "cohere_medium":
|
||||
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
|
||||
|
||||
# create a prompt template
|
||||
if history:
|
||||
history = json.loads(history)
|
||||
template_temp = template_hist.replace("{historyquestion}", history[0]).replace("{historyanswer}",
|
||||
history[1])
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template_temp,
|
||||
template_format="jinja2")
|
||||
else:
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
|
||||
template_format="jinja2")
|
||||
|
||||
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
|
||||
template_format="jinja2")
|
||||
if llm_choice == "openai_chat":
|
||||
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
|
||||
llm = ChatOpenAI(openai_api_key=api_key)
|
||||
messages_combine = [
|
||||
SystemMessagePromptTemplate.from_template(chat_combine_template),
|
||||
HumanMessagePromptTemplate.from_template("{question}")
|
||||
]
|
||||
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
|
||||
messages_reduce = [
|
||||
SystemMessagePromptTemplate.from_template(chat_reduce_template),
|
||||
HumanMessagePromptTemplate.from_template("{question}")
|
||||
]
|
||||
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
|
||||
elif llm_choice == "openai":
|
||||
llm = OpenAI(openai_api_key=api_key, temperature=0)
|
||||
elif llm_choice == "manifest":
|
||||
llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
|
||||
elif llm_choice == "huggingface":
|
||||
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
|
||||
elif llm_choice == "cohere":
|
||||
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
|
||||
|
||||
if llm_choice == "openai_chat":
|
||||
chain = ChatVectorDBChain.from_llm(
|
||||
llm=llm,
|
||||
vectorstore=docsearch,
|
||||
prompt=p_chat_combine,
|
||||
qa_prompt=p_chat_reduce,
|
||||
top_k_docs_for_context=3,
|
||||
return_source_documents=False)
|
||||
result = chain({"question": question, "chat_history": []})
|
||||
else:
|
||||
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
|
||||
combine_prompt=c_prompt, question_prompt=q_prompt)
|
||||
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=4)
|
||||
result = chain({"query": question})
|
||||
|
||||
print(result)
|
||||
|
||||
# some formatting for the frontend
|
||||
if "result" in result:
|
||||
result['answer'] = result['result']
|
||||
result['answer'] = result['answer'].replace("\\n", "\n")
|
||||
try:
|
||||
result['answer'] = result['answer'].split("SOURCES:")[0]
|
||||
except:
|
||||
pass
|
||||
|
||||
# mock result
|
||||
# result = {
|
||||
# "answer": "The answer is 42",
|
||||
# "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"]
|
||||
# }
|
||||
return result
|
||||
except Exception as e:
|
||||
# print whole traceback
|
||||
traceback.print_exc()
|
||||
print(str(e))
|
||||
return bad_request(500, str(e))
|
||||
# Handle other cases or render the default page
|
||||
return 'Welcome to DocsGPT Backend!'
|
||||
|
||||
|
||||
@app.route("/api/docs_check", methods=["POST"])
def check_docs():
    """Check that a docs index is available, downloading it when it is not.

    Reads ``docs`` from the JSON body.

    Returns:
        ``{"status": "exists"}`` for ``local/...`` and ``default`` indexes or
        when ``vectors/<docs>`` already exists, ``{"status": "loaded"}``
        after both index files were downloaded, and ``{"status": "null"}``
        when the index cannot be fetched or the name is rejected.
    """
    data = request.get_json()
    docs = data["docs"]
    # split docs on / and take first part
    segments = docs.split("/")
    if segments[0] == "local":
        return {"status": 'exists'}
    if ".." in segments:
        # the name becomes a directory that is created and written to, so
        # parent-directory segments must not get through
        return {"status": 'null'}
    vectorstore = "vectors/" + docs
    base_path = 'https://raw.githubusercontent.com/arc53/DocsHUB/main/'
    if os.path.exists(vectorstore) or docs == "default":
        return {"status": 'exists'}

    # NOTE(review): the concatenations below assume ``docs`` ends with "/".
    # Fetch both files before writing anything, so a failed second download
    # cannot leave a half-populated directory that later reports "exists".
    faiss_response = requests.get(base_path + vectorstore + "index.faiss", timeout=30)
    if faiss_response.status_code != 200:
        return {"status": 'null'}
    pkl_response = requests.get(base_path + vectorstore + "index.pkl", timeout=30)
    if pkl_response.status_code != 200:
        return {"status": 'null'}

    os.makedirs(vectorstore, exist_ok=True)
    with open(vectorstore + "index.faiss", "wb") as f:
        f.write(faiss_response.content)
    with open(vectorstore + "index.pkl", "wb") as f:
        f.write(pkl_response.content)

    return {"status": 'loaded'}
|
||||
|
||||
|
||||
@app.route("/api/feedback", methods=["POST"])
def api_feedback():
    """Log a feedback entry and forward it to the feedback endpoint.

    Reads ``question``, ``answer`` and ``feedback`` from the JSON body and
    always returns ``{"status": "ok"}`` once the upstream call has returned.
    """
    data = request.get_json()
    question = data["question"]
    answer = data["answer"]
    feedback = data["feedback"]

    print('-' * 5)
    print("Question: " + question)
    print("Answer: " + answer)
    print("Feedback: " + feedback)
    print('-' * 5)
    # The upstream response is not inspected, so it is not bound to a name.
    requests.post(
        url="https://86x89umx77.execute-api.eu-west-2.amazonaws.com/docsgpt-feedback",
        headers={
            "Content-Type": "application/json; charset=utf-8",
        },
        data=json.dumps({
            "answer": answer,
            "question": question,
            "feedback": feedback
        }),
        # without a timeout an unresponsive endpoint would block this request
        # handler indefinitely
        timeout=10,
    )
    return {"status": 'ok'}
|
||||
|
||||
|
||||
@app.route('/api/combine', methods=['GET'])
def combined_json():
    """Provide json file with combined available indexes.

    Lists the ``vectors_collection`` entries of user ``'local'`` followed by
    the remote ``combined.json`` catalogue, with each remote entry's
    ``location`` set to ``"remote"``.
    """
    user = 'local'

    data = []
    # structure: name, language, version, description, fullName, date, docLink
    # append data from vectors_collection
    for index in vectors_collection.find({'user': user}):
        data.append({
            "name": index['name'],
            "language": index['language'],
            "version": '',
            "description": index['name'],
            "fullName": index['name'],
            "date": index['date'],
            "docLink": index['location'],
            "model": embeddings_choice,
            "location": "local"
        })

    # the timeout keeps an unresponsive CDN from blocking this handler
    data_remote = requests.get(
        "https://d3dg1063dc54p9.cloudfront.net/combined.json", timeout=10
    ).json()
    for index in data_remote:
        index['location'] = "remote"
        data.append(index)

    return jsonify(data)
|
||||
|
||||
|
||||
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Upload a file to get vectorized and indexed.

    Expects form fields ``user`` and ``name`` plus a ``file`` part. The file is
    saved under ``UPLOAD_FOLDER/<user>/<name>/`` and an ``ingest`` task is queued.

    Returns:
        dict: ``{"status": "ok", "task_id": ...}`` on success, otherwise a dict
        whose ``status`` names the missing piece (or ``"error"``).
    """
    form = request.form
    if 'user' not in form:
        return {"status": 'no user'}
    user = secure_filename(form['user'])
    if 'name' not in form:
        return {"status": 'no name'}
    job_name = secure_filename(form['name'])

    # check if the post request has the file part
    if 'file' not in request.files:
        print('No file part')
        return {"status": 'no file'}
    file = request.files['file']
    if file.filename == '':
        return {"status": 'no file name'}

    if not file:
        return {"status": 'error'}

    filename = secure_filename(file.filename)
    # target directory, created on demand
    save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    file.save(os.path.join(save_dir, filename))

    task = ingest.delay('temp', [".rst", ".md", ".pdf"], job_name, filename, user)
    return {"status": 'ok', "task_id": task.id}
|
||||
|
||||
|
||||
@app.route('/api/task_status', methods=['GET'])
def task_status():
    """Get celery job status.

    Looks up the task named by the ``task_id`` query parameter.

    Returns:
        dict: ``{"status": <task status>, "result": <task info>}``.
    """
    job = AsyncResult(request.args.get('task_id'))
    job_info = job.info
    return {"status": job.status, "result": job_info}
|
||||
|
||||
|
||||
### Background task API
|
||||
@app.route('/api/upload_index', methods=['POST'])
def upload_index_files():
    """Upload two files(index.faiss, index.pkl) to the user's folder.

    Expects form fields ``user`` and ``name`` and file parts ``file_faiss`` and
    ``file_pkl``. Both files are stored under ``indexes/<user>/<name>/`` and an
    entry describing the index is inserted into ``vectors_collection``.

    Returns:
        dict: ``{"status": "ok"}`` on success, otherwise a dict whose ``status``
        names the missing piece.
    """
    form = request.form
    if 'user' not in form:
        return {"status": 'no user'}
    user = secure_filename(form['user'])
    if 'name' not in form:
        return {"status": 'no name'}
    job_name = secure_filename(form['name'])

    # Validate the two uploads in the same order: faiss first, then pkl.
    uploads = {}
    for field in ('file_faiss', 'file_pkl'):
        if field not in request.files:
            print('No file part')
            return {"status": 'no file'}
        part = request.files[field]
        if part.filename == '':
            return {"status": 'no file name'}
        uploads[field] = part

    # saves index files
    save_dir = os.path.join('indexes', user, job_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    uploads['file_faiss'].save(os.path.join(save_dir, 'index.faiss'))
    uploads['file_pkl'].save(os.path.join(save_dir, 'index.pkl'))

    # create entry in vectors_collection
    entry = {
        "user": user,
        "name": job_name,
        "language": job_name,
        "location": save_dir,
        "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
        "model": embeddings_choice,
        "type": "local"
    }
    vectors_collection.insert_one(entry)
    return {"status": 'ok'}
|
||||
|
||||
|
||||
@app.route('/api/download', methods=['get'])
def download_file():
    """Send a previously uploaded file back as an attachment.

    The ``user``, ``name`` and ``file`` query parameters are each passed through
    ``secure_filename`` and resolved under ``UPLOAD_FOLDER/<user>/<name>/``.
    """
    args = request.args
    user, job_name, filename = [
        secure_filename(args.get(key)) for key in ('user', 'name', 'file')
    ]
    directory = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name)
    return send_from_directory(directory, filename, as_attachment=True)
|
||||
|
||||
|
||||
@app.route('/api/delete_old', methods=['get'])
def delete_old():
    """Delete old indexes.

    Removes the ``vectors_collection`` entry whose ``location`` equals the
    ``path`` query parameter and deletes the matching directory tree.

    Only paths under ``indexes/`` or ``vectors/`` are accepted. Every segment
    after the first is passed through ``secure_filename`` and the sanitised
    path is the one removed from disk.

    Returns:
        dict: ``{"status": "ok"}`` after the delete attempt, or
        ``{"status": "error"}`` when ``path`` is missing, does not start with an
        allowed root, or names nothing below that root.
    """
    import shutil
    path = request.args.get('path')
    if not path:
        # No path supplied: previously this crashed on ``None.split``.
        return {"status": 'error'}
    dirs = path.split('/')
    # check that path starts with indexes or vectors
    if dirs[0] not in ['indexes', 'vectors']:
        return {"status": 'error'}

    # secure_filename() turns segments such as ".." into "", so dropping empty
    # results removes traversal components and stray slashes.
    dirs_clean = [secure_filename(part) for part in dirs[1:]]
    dirs_clean = [part for part in dirs_clean if part]
    if not dirs_clean:
        # Refuse to delete the whole "indexes"/"vectors" root.
        return {"status": 'error'}

    # Build the on-disk path from the SANITISED segments. The old code joined
    # the raw ``dirs``, so "indexes/../.." could delete arbitrary directories.
    path_clean = '/'.join([dirs[0]] + dirs_clean)
    vectors_collection.delete_one({'location': path})
    try:
        shutil.rmtree(path_clean)
    except FileNotFoundError:
        # Nothing on disk to remove; the database entry is already gone.
        pass
    return {"status": 'ok'}
|
||||
|
||||
|
||||
# handling CORS
|
||||
@app.after_request
def after_request(response):
    """Attach permissive CORS headers to every outgoing response.

    Args:
        response: the response object Flask is about to send.

    Returns:
        The same response object, with the CORS headers added.
    """
    # Each header is added exactly once; the block previously added every
    # header twice, which produces duplicate Access-Control-Allow-Origin values.
    response.headers.add("Access-Control-Allow-Origin", "*")
    response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization")
    response.headers.add("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS")
    # Access-Control-Allow-Credentials is intentionally not sent: it cannot be
    # combined with the wildcard origin above.
    return response
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Start a single development server. Two consecutive app.run() calls were
    # present, and the second is only reached after the first server stops.
    # Port 7091 matches the default API_URL in application/core/settings.py.
    app.run(debug=True, port=7091)
|
||||
|
||||
9
application/celery.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from celery import Celery
|
||||
from application.core.settings import settings
|
||||
|
||||
def make_celery(app_name=__name__):
    """Build the Celery application used by the backend.

    Args:
        app_name: name given to the Celery app; defaults to this module's name.

    Returns:
        Celery: an app whose broker and result backend come from ``settings``
        and whose configuration was updated with ``settings``.
    """
    worker_app = Celery(
        app_name,
        broker=settings.CELERY_BROKER_URL,
        backend=settings.CELERY_RESULT_BACKEND,
    )
    worker_app.conf.update(settings)
    return worker_app
|
||||
|
||||
celery = make_celery()
|
||||
8
application/celeryconfig.py
Normal file
@@ -0,0 +1,8 @@
|
||||
import os
|
||||
|
||||
# Broker and result backend locations are read from the environment
# (each is None when its variable is unset).
broker_url = os.environ.get("CELERY_BROKER_URL")
result_backend = os.environ.get("CELERY_RESULT_BACKEND")

# Tasks and results are serialised as JSON, and only JSON is accepted.
task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']
|
||||
0
application/core/__init__.py
Normal file
37
application/core/settings.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from pydantic import BaseSettings
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration with built-in defaults for local development."""

    LLM_NAME: str = "openai"
    EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002"
    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
    MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
    MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf")
    TOKENS_MAX_HISTORY: int = 150
    UPLOAD_FOLDER: str = "inputs"
    VECTOR_STORE: str = "faiss"  # "faiss" or "elasticsearch"

    API_URL: str = "http://localhost:7091"  # backend url for celery worker

    API_KEY: str = None  # LLM api key
    EMBEDDINGS_KEY: str = None  # api key for embeddings (if using openai, just copy API_KEY)
    OPENAI_API_BASE: str = None  # azure openai api base url
    OPENAI_API_VERSION: str = None  # azure openai api version
    AZURE_DEPLOYMENT_NAME: str = None  # azure deployment name for answering
    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: str = None  # azure deployment name for embeddings

    # sagemaker
    # Read by application/llm/sagemaker.py as settings.SAGEMAKER_API_URL; the
    # field was missing here, so that attribute access failed.
    SAGEMAKER_API_URL: str = None  # url the sagemaker LLM posts prompts to

    # elasticsearch
    ELASTIC_CLOUD_ID: str = None  # cloud id for elasticsearch
    ELASTIC_USERNAME: str = None  # username for elasticsearch
    ELASTIC_PASSWORD: str = None  # password for elasticsearch
    ELASTIC_URL: str = None  # url for elasticsearch
    ELASTIC_INDEX: str = "docsgpt"  # index name for elasticsearch
|
||||
|
||||
|
||||
path = Path(__file__).parent.parent.absolute()
|
||||
settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")
|
||||
@@ -1,13 +1,15 @@
|
||||
from flask import jsonify
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
|
||||
def response_error(code_status, message=None):
    """Build a JSON error response for the given HTTP status code.

    Args:
        code_status: HTTP status code to set on the response.
        message: optional detail, included under ``"message"`` when truthy.

    Returns:
        flask.Response: JSON body with an ``"error"`` entry (the standard phrase
        for the status code, or "something went wrong" when the code is
        unknown) and ``status_code`` set to ``code_status``.
    """
    # A stale copy of this definition's first two lines used to precede it;
    # only this single definition is kept.
    payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
    if message:
        payload['message'] = message
    response = jsonify(payload)
    response.status_code = code_status
    return response
|
||||
|
||||
def bad_request(status_code=400, message=''):
    """Return a JSON error response, defaulting to HTTP 400.

    Args:
        status_code: HTTP status code for the response.
        message: optional detail forwarded to ``response_error``.

    Returns:
        flask.Response: whatever ``response_error`` builds for these arguments.
    """
    # A stale duplicate of this definition used to precede it; only this
    # single definition is kept.
    return response_error(code_status=status_code, message=message)
|
||||
|
||||
0
application/llm/__init__.py
Normal file
14
application/llm/base.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseLLM(ABC):
    """Abstract interface every LLM backend implements."""

    def __init__(self):
        """Nothing to initialise at this level."""

    @abstractmethod
    def gen(self, *args, **kwargs):
        """Produce a complete answer; concrete backends must override this."""

    @abstractmethod
    def gen_stream(self, *args, **kwargs):
        """Produce an answer incrementally; concrete backends must override this."""
|
||||
31
application/llm/huggingface.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from application.llm.base import BaseLLM
|
||||
|
||||
class HuggingFaceLLM(BaseLLM):
    """LLM backend that runs a Hugging Face text-generation pipeline locally."""

    def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B'):
        """Load the tokenizer and model and wrap them in a LangChain pipeline.

        Args:
            api_key: accepted for interface parity with other backends; not
                used by this constructor.
            llm_name: model identifier passed to ``from_pretrained``.
        """
        # The pipeline is stored in a module-level global that gen() reads.
        global hf

        from langchain.llms import HuggingFacePipeline
        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
        tokenizer = AutoTokenizer.from_pretrained(llm_name)
        model = AutoModelForCausalLM.from_pretrained(llm_name)
        pipe = pipeline(
            "text-generation", model=model,
            tokenizer=tokenizer, max_new_tokens=2000,
            device_map="auto", eos_token_id=tokenizer.eos_token_id
        )
        hf = HuggingFacePipeline(pipeline=pipe)

    def gen(self, model, engine, messages, stream=False, **kwargs):
        """Generate an answer for the last message, using the first as context.

        Args:
            model, engine, stream, **kwargs: accepted for interface parity; not
                used here.
            messages: sequence of dicts with a ``content`` key; the first
                supplies the context and the last supplies the question.

        Returns:
            The generated text.
        """
        context = messages[0]['content']
        user_question = messages[-1]['content']
        prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"

        result = hf(prompt)

        # Calling a LangChain LLM returns a plain string, so ``result.content``
        # raised AttributeError. Fall back to the result itself, while still
        # honouring objects that do expose ``content``.
        return getattr(result, "content", result)

    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
        """Streaming is not supported by this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("HuggingFaceLLM Streaming is not implemented yet.")
|
||||
|
||||
39
application/llm/llama_cpp.py
Normal file
@@ -0,0 +1,39 @@
|
||||
from application.llm.base import BaseLLM
|
||||
from application.core.settings import settings
|
||||
|
||||
class LlamaCpp(BaseLLM):
    """LLM backend that runs a local model through llama-cpp-python."""

    def __init__(self, api_key, llm_name=settings.MODEL_PATH, **kwargs):
        """Load the model file into a module-level ``llama`` instance.

        Args:
            api_key: accepted for interface parity; not used here.
            llm_name: path of the model file, defaulting to ``settings.MODEL_PATH``.

        Raises:
            ImportError: when ``llama_cpp`` is not installed.
        """
        global llama
        try:
            from llama_cpp import Llama
        except ImportError:
            raise ImportError("Please install llama_cpp using pip install llama-cpp-python")
        else:
            llama = Llama(model_path=llm_name, n_ctx=2048)

    def _prompt_from(self, messages):
        """Format the first message (context) and last message (question) as a prompt."""
        context = messages[0]['content']
        user_question = messages[-1]['content']
        return f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"

    def gen(self, model, engine, messages, stream=False, **kwargs):
        """Return the completion text that follows the answer marker."""
        completion = llama(self._prompt_from(messages), max_tokens=150, echo=False)
        text = completion['choices'][0]['text']
        return text.split('### Answer \n')[-1]

    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
        """Yield the text of every choice in every streamed item, in order."""
        chunks = llama(self._prompt_from(messages), max_tokens=150, echo=False, stream=stream)
        for chunk in chunks:
            yield from (choice['text'] for choice in chunk['choices'])
|
||||
22
application/llm/llm_creator.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from application.llm.openai import OpenAILLM, AzureOpenAILLM
|
||||
from application.llm.sagemaker import SagemakerAPILLM
|
||||
from application.llm.huggingface import HuggingFaceLLM
|
||||
from application.llm.llama_cpp import LlamaCpp
|
||||
|
||||
|
||||
|
||||
class LLMCreator:
    """Factory that maps a backend name to its LLM class."""

    # Keys are matched against the lower-cased ``type`` argument.
    llms = {
        'openai': OpenAILLM,
        'azure_openai': AzureOpenAILLM,
        'sagemaker': SagemakerAPILLM,
        'huggingface': HuggingFaceLLM,
        'llama.cpp': LlamaCpp
    }

    @classmethod
    def create_llm(cls, type, *args, **kwargs):
        """Instantiate the backend registered under ``type``.

        Args:
            type: backend name, matched case-insensitively.
            *args, **kwargs: forwarded to the backend's constructor.

        Raises:
            ValueError: when no backend is registered under that name.
        """
        backend = cls.llms.get(type.lower())
        if backend:
            return backend(*args, **kwargs)
        raise ValueError(f"No LLM class found for type {type}")
|
||||
57
application/llm/openai.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from application.llm.base import BaseLLM
|
||||
from application.core.settings import settings
|
||||
|
||||
class OpenAILLM(BaseLLM):
    """LLM backend that talks to the OpenAI chat completion API."""

    def __init__(self, api_key):
        """Import ``openai`` into the module namespace and set its API key."""
        global openai
        import openai
        openai.api_key = api_key
        self.api_key = api_key  # kept so _get_openai() can re-apply it

    def _get_openai(self):
        """Return the ``openai`` module with this instance's API key applied."""
        import openai
        openai.api_key = self.api_key
        return openai

    def _chat_completion(self, model, engine, messages, stream, **kwargs):
        """Issue one ChatCompletion request through the module-level ``openai``."""
        return openai.ChatCompletion.create(
            model=model,
            engine=engine,
            messages=messages,
            stream=stream,
            **kwargs
        )

    def gen(self, model, engine, messages, stream=False, **kwargs):
        """Return the message content of the first choice."""
        completion = self._chat_completion(model, engine, messages, stream, **kwargs)
        first_choice = completion["choices"][0]
        return first_choice["message"]["content"]

    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
        """Yield each content fragment of the first choice as it arrives."""
        for event in self._chat_completion(model, engine, messages, stream, **kwargs):
            if "content" in event["choices"][0]["delta"]:
                yield event["choices"][0]["delta"]["content"]
|
||||
|
||||
|
||||
class AzureOpenAILLM(OpenAILLM):
    """OpenAI backend variant carrying Azure OpenAI connection settings."""

    def __init__(self, openai_api_key, openai_api_base, openai_api_version, deployment_name):
        """Store the API key and the Azure settings.

        Args:
            openai_api_key: key forwarded to ``OpenAILLM.__init__``.
            openai_api_base, openai_api_version, deployment_name: accepted by
                the signature; as before, the stored values are read from
                ``settings`` rather than from these arguments.
        """
        super().__init__(openai_api_key)
        # No trailing commas: ``x = value,`` stored a 1-tuple, so api_base,
        # api_version and deployment_name were tuples instead of strings.
        self.api_base = settings.OPENAI_API_BASE
        self.api_version = settings.OPENAI_API_VERSION
        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME

    def _get_openai(self):
        """Return the ``openai`` module configured for Azure.

        Applies the API key via the parent implementation, then sets the API
        base, API version and ``api_type = "azure"`` on the module.
        """
        openai = super()._get_openai()
        openai.api_base = self.api_base
        openai.api_version = self.api_version
        openai.api_type = "azure"
        return openai
|
||||
27
application/llm/sagemaker.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from application.llm.base import BaseLLM
|
||||
from application.core.settings import settings
|
||||
import requests
|
||||
import json
|
||||
|
||||
class SagemakerAPILLM(BaseLLM):
    """LLM backend that posts prompts to an HTTP endpoint taken from settings."""

    def __init__(self, *args, **kwargs):
        """Remember the endpoint URL; all arguments are ignored."""
        self.url = settings.SAGEMAKER_API_URL

    def gen(self, model, engine, messages, stream=False, **kwargs):
        """POST a prompt built from the first and last messages and return the answer.

        The first message's content is the context and the last message's
        content is the question. The ``answer`` field of the JSON reply is
        returned.
        """
        context = messages[0]['content']
        user_question = messages[-1]['content']
        prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"

        request_headers = {
            "Content-Type": "application/json; charset=utf-8",
        }
        request_body = json.dumps({"input": prompt})
        reply = requests.post(url=self.url, headers=request_headers, data=request_body)
        return reply.json()['answer']

    def gen_stream(self, model, engine, messages, stream=True, **kwargs):
        """Streaming is not available for this backend.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Sagemaker does not support streaming")
|
||||
1
application/parser/file/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
@@ -3,8 +3,7 @@ from abc import abstractmethod
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document as LCDocument
|
||||
|
||||
from parser.schema.base import Document
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
|
||||
class BaseReader:
|
||||
|
||||
@@ -3,15 +3,15 @@ import logging
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
from parser.file.base import BaseReader
|
||||
from parser.file.base_parser import BaseParser
|
||||
from parser.file.docs_parser import DocxParser, PDFParser
|
||||
from parser.file.epub_parser import EpubParser
|
||||
from parser.file.html_parser import HTMLParser
|
||||
from parser.file.markdown_parser import MarkdownParser
|
||||
from parser.file.rst_parser import RstParser
|
||||
from parser.file.tabular_parser import PandasCSVParser
|
||||
from parser.schema.base import Document
|
||||
from application.parser.file.base import BaseReader
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
from application.parser.file.docs_parser import DocxParser, PDFParser
|
||||
from application.parser.file.epub_parser import EpubParser
|
||||
from application.parser.file.html_parser import HTMLParser
|
||||
from application.parser.file.markdown_parser import MarkdownParser
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".pdf": PDFParser(),
|
||||
@@ -52,17 +52,17 @@ class SimpleDirectoryReader(BaseReader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_dir: Optional[str] = None,
|
||||
input_files: Optional[List] = None,
|
||||
exclude_hidden: bool = True,
|
||||
errors: str = "ignore",
|
||||
recursive: bool = True,
|
||||
required_exts: Optional[List[str]] = None,
|
||||
file_extractor: Optional[Dict[str, BaseParser]] = None,
|
||||
num_files_limit: Optional[int] = None,
|
||||
file_metadata: Optional[Callable[[str], Dict]] = None,
|
||||
chunk_size_max: int = 2048,
|
||||
self,
|
||||
input_dir: Optional[str] = None,
|
||||
input_files: Optional[List] = None,
|
||||
exclude_hidden: bool = True,
|
||||
errors: str = "ignore",
|
||||
recursive: bool = True,
|
||||
required_exts: Optional[List[str]] = None,
|
||||
file_extractor: Optional[Dict[str, BaseParser]] = None,
|
||||
num_files_limit: Optional[int] = None,
|
||||
file_metadata: Optional[Callable[[str], Dict]] = None,
|
||||
chunk_size_max: int = 2048,
|
||||
) -> None:
|
||||
"""Initialize with parameters."""
|
||||
super().__init__()
|
||||
@@ -102,8 +102,8 @@ class SimpleDirectoryReader(BaseReader):
|
||||
elif self.exclude_hidden and input_file.name.startswith("."):
|
||||
continue
|
||||
elif (
|
||||
self.required_exts is not None
|
||||
and input_file.suffix not in self.required_exts
|
||||
self.required_exts is not None
|
||||
and input_file.suffix not in self.required_exts
|
||||
):
|
||||
continue
|
||||
else:
|
||||
@@ -114,7 +114,7 @@ class SimpleDirectoryReader(BaseReader):
|
||||
new_input_files.extend(sub_input_files)
|
||||
|
||||
if self.num_files_limit is not None and self.num_files_limit > 0:
|
||||
new_input_files = new_input_files[0 : self.num_files_limit]
|
||||
new_input_files = new_input_files[0: self.num_files_limit]
|
||||
|
||||
# print total number of files added
|
||||
logging.debug(
|
||||
|
||||
@@ -6,7 +6,7 @@ Contains parsers for docx, pdf files.
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class PDFParser(BaseParser):
|
||||
|
||||
@@ -6,7 +6,7 @@ Contains parsers for epub files.
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class EpubParser(BaseParser):
|
||||
|
||||
@@ -7,7 +7,8 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class HTMLParser(BaseParser):
|
||||
"""HTML parser."""
|
||||
@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
|
||||
Union[str, List[str]]: a string or a List of strings.
|
||||
"""
|
||||
try:
|
||||
import unstructured
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.staging.base import convert_to_isd
|
||||
from unstructured.cleaners.core import clean
|
||||
except ImportError:
|
||||
raise ValueError("unstructured package is required to parse HTML files.")
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.staging.base import convert_to_isd
|
||||
from unstructured.cleaners.core import clean
|
||||
|
||||
# Using the unstructured library to convert the html to isd format
|
||||
# isd sample : isd = [
|
||||
# {"text": "My Title", "type": "Title"},
|
||||
# {"text": "My Narrative", "type": "NarrativeText"}
|
||||
# ]
|
||||
# {"text": "My Title", "type": "Title"},
|
||||
# {"text": "My Narrative", "type": "NarrativeText"}
|
||||
# ]
|
||||
with open(file, "r", encoding="utf-8") as fp:
|
||||
elements = partition_html(file=fp)
|
||||
isd = convert_to_isd(elements)
|
||||
isd = convert_to_isd(elements)
|
||||
|
||||
# Removing non ascii charactwers from isd_el['text']
|
||||
# Removing non ascii charactwers from isd_el['text']
|
||||
for isd_el in isd:
|
||||
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
|
||||
|
||||
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
|
||||
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
|
||||
for isd_el in isd:
|
||||
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
|
||||
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
|
||||
|
||||
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
|
||||
for isd_el in isd:
|
||||
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
|
||||
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
|
||||
|
||||
# Creating a list of all the indexes of isd_el['type'] = 'Title'
|
||||
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||||
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||||
|
||||
# Creating 'Chunks' - List of lists of strings
|
||||
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
|
||||
@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
|
||||
Chunks = [[]]
|
||||
final_chunks = list(list())
|
||||
|
||||
for i,isd_el in enumerate(isd):
|
||||
for i, isd_el in enumerate(isd):
|
||||
if i in title_indexes:
|
||||
Chunks.append([])
|
||||
Chunks[-1].append(isd_el['text'])
|
||||
|
||||
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
|
||||
# Removing all the chunks with sum of length of all the strings in the chunk < 25
|
||||
# TODO: This value can be an user defined variable
|
||||
for chunk in Chunks:
|
||||
# sum of lenth of all the strings in the chunk
|
||||
# sum of length of all the strings in the chunk
|
||||
sum = 0
|
||||
sum += len(str(chunk))
|
||||
if sum < 25:
|
||||
Chunks.remove(chunk)
|
||||
else :
|
||||
else:
|
||||
# appending all the approved chunks to final_chunks as a single string
|
||||
final_chunks.append(" ".join([str(item) for item in chunk]))
|
||||
return final_chunks
|
||||
|
||||
@@ -7,8 +7,8 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
import tiktoken
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
|
||||
@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
remove_hyperlinks: bool = True,
|
||||
remove_images: bool = True,
|
||||
max_tokens: int = 2048,
|
||||
# remove_tables: bool = True,
|
||||
**kwargs: Any,
|
||||
self,
|
||||
*args: Any,
|
||||
remove_hyperlinks: bool = True,
|
||||
remove_images: bool = True,
|
||||
max_tokens: int = 2048,
|
||||
# remove_tables: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
|
||||
self._max_tokens = max_tokens
|
||||
# self._remove_tables = remove_tables
|
||||
|
||||
|
||||
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
|
||||
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
|
||||
current_text: str):
|
||||
"""Append to tups chunk."""
|
||||
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
|
||||
if num_tokens > self._max_tokens:
|
||||
@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
|
||||
else:
|
||||
tups.append((current_header, current_text))
|
||||
return tups
|
||||
|
||||
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
|
||||
"""Convert a markdown file to a dictionary.
|
||||
|
||||
@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
|
||||
return {}
|
||||
|
||||
def parse_tups(
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
) -> List[Tuple[Optional[str], str]]:
|
||||
"""Parse file into tuples."""
|
||||
with open(filepath, "r") as f:
|
||||
@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
|
||||
return markdown_tups
|
||||
|
||||
def parse_file(
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
) -> Union[str, List[str]]:
|
||||
"""Parse file into string."""
|
||||
tups = self.parse_tups(filepath, errors=errors)
|
||||
|
||||
@@ -5,10 +5,10 @@ Contains parser for md files.
|
||||
"""
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
import tiktoken
|
||||
|
||||
class RstParser(BaseParser):
|
||||
"""reStructuredText parser.
|
||||
@@ -19,17 +19,17 @@ class RstParser(BaseParser):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
remove_hyperlinks: bool = True,
|
||||
remove_images: bool = True,
|
||||
remove_table_excess: bool = True,
|
||||
remove_interpreters: bool = True,
|
||||
remove_directives: bool = True,
|
||||
remove_whitespaces_excess: bool = True,
|
||||
#Be carefull with remove_characters_excess, might cause data loss
|
||||
remove_characters_excess: bool = True,
|
||||
**kwargs: Any,
|
||||
self,
|
||||
*args: Any,
|
||||
remove_hyperlinks: bool = True,
|
||||
remove_images: bool = True,
|
||||
remove_table_excess: bool = True,
|
||||
remove_interpreters: bool = True,
|
||||
remove_directives: bool = True,
|
||||
remove_whitespaces_excess: bool = True,
|
||||
# Be careful with remove_characters_excess, might cause data loss
|
||||
remove_characters_excess: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -41,7 +41,6 @@ class RstParser(BaseParser):
|
||||
self._remove_whitespaces_excess = remove_whitespaces_excess
|
||||
self._remove_characters_excess = remove_characters_excess
|
||||
|
||||
|
||||
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
|
||||
"""Convert a reStructuredText file to a dictionary.
|
||||
|
||||
@@ -56,7 +55,8 @@ class RstParser(BaseParser):
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
|
||||
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
|
||||
if header_match and i > 0 and (
|
||||
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
|
||||
if current_header is not None:
|
||||
if current_text == "" or None:
|
||||
continue
|
||||
@@ -72,7 +72,7 @@ class RstParser(BaseParser):
|
||||
|
||||
rst_tups.append((current_header, current_text))
|
||||
|
||||
#TODO: Format for rst
|
||||
# TODO: Format for rst
|
||||
#
|
||||
# if current_header is not None:
|
||||
# # pass linting, assert keys are defined
|
||||
@@ -136,7 +136,7 @@ class RstParser(BaseParser):
|
||||
return {}
|
||||
|
||||
def parse_tups(
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
) -> List[Tuple[Optional[str], str]]:
|
||||
"""Parse file into tuples."""
|
||||
with open(filepath, "r") as f:
|
||||
@@ -159,7 +159,7 @@ class RstParser(BaseParser):
|
||||
return rst_tups
|
||||
|
||||
def parse_file(
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
self, filepath: Path, errors: str = "ignore"
|
||||
) -> Union[str, List[str]]:
|
||||
"""Parse file into string."""
|
||||
tups = self.parse_tups(filepath, errors=errors)
|
||||
|
||||
@@ -6,7 +6,7 @@ Contains parsers for tabular data files.
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class CSVParser(BaseParser):
|
||||
@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
concat_rows: bool = True,
|
||||
col_joiner: str = ", ",
|
||||
row_joiner: str = "\n",
|
||||
pandas_config: dict = {},
|
||||
**kwargs: Any
|
||||
self,
|
||||
*args: Any,
|
||||
concat_rows: bool = True,
|
||||
col_joiner: str = ", ",
|
||||
row_joiner: str = "\n",
|
||||
pandas_config: dict = {},
|
||||
**kwargs: Any
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import os
|
||||
|
||||
import javalang
|
||||
|
||||
|
||||
def find_files(directory):
|
||||
files_list = []
|
||||
for root, dirs, files in os.walk(directory):
|
||||
@@ -9,6 +11,7 @@ def find_files(directory):
|
||||
files_list.append(os.path.join(root, file))
|
||||
return files_list
|
||||
|
||||
|
||||
def extract_functions(file_path):
|
||||
with open(file_path, "r") as file:
|
||||
java_code = file.read()
|
||||
@@ -28,6 +31,7 @@ def extract_functions(file_path):
|
||||
methods[method_name] = method_source_code
|
||||
return methods
|
||||
|
||||
|
||||
def extract_classes(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
source_code = file.read()
|
||||
@@ -47,6 +51,7 @@ def extract_classes(file_path):
|
||||
classes[class_name] = class_string
|
||||
return classes
|
||||
|
||||
|
||||
def extract_functions_and_classes(directory):
|
||||
files = find_files(directory)
|
||||
functions_dict = {}
|
||||
@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
|
||||
classes = extract_classes(file)
|
||||
if classes:
|
||||
classes_dict[file] = classes
|
||||
return functions_dict, classes_dict
|
||||
return functions_dict, classes_dict
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import esprima
|
||||
|
||||
import escodegen
|
||||
import esprima
|
||||
|
||||
|
||||
def find_files(directory):
|
||||
@@ -11,6 +12,7 @@ def find_files(directory):
|
||||
files_list.append(os.path.join(root, file))
|
||||
return files_list
|
||||
|
||||
|
||||
def extract_functions(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
source_code = file.read()
|
||||
@@ -26,7 +28,6 @@ def extract_functions(file_path):
|
||||
func_name = declaration.id.name if declaration.id else '<anonymous>'
|
||||
functions[func_name] = escodegen.generate(declaration.init)
|
||||
elif node.type == 'ClassDeclaration':
|
||||
class_name = node.id.name
|
||||
for subnode in node.body.body:
|
||||
if subnode.type == 'MethodDefinition':
|
||||
func_name = subnode.key.name
|
||||
@@ -38,6 +39,7 @@ def extract_functions(file_path):
|
||||
functions[func_name] = escodegen.generate(declaration.init)
|
||||
return functions
|
||||
|
||||
|
||||
def extract_classes(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
source_code = file.read()
|
||||
@@ -53,6 +55,7 @@ def extract_classes(file_path):
|
||||
classes[class_name] = ", ".join(function_names)
|
||||
return classes
|
||||
|
||||
|
||||
def extract_functions_and_classes(directory):
|
||||
files = find_files(directory)
|
||||
functions_dict = {}
|
||||
|
||||
@@ -1,50 +1,62 @@
|
||||
import os
|
||||
import faiss
|
||||
import pickle
|
||||
|
||||
import tiktoken
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
#from langchain.embeddings import HuggingFaceEmbeddings
|
||||
#from langchain.embeddings import HuggingFaceInstructEmbeddings
|
||||
#from langchain.embeddings import CohereEmbeddings
|
||||
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
from application.core.settings import settings
|
||||
from retry import retry
|
||||
|
||||
|
||||
# from langchain.embeddings import HuggingFaceEmbeddings
|
||||
# from langchain.embeddings import HuggingFaceInstructEmbeddings
|
||||
# from langchain.embeddings import CohereEmbeddings
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
||||
# Function to convert string to tokens and estimate user cost.
|
||||
# Function to convert string to tokens and estimate user cost.
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
num_tokens = len(encoding.encode(string))
|
||||
total_price = ((num_tokens/1000) * 0.0004)
|
||||
total_price = ((num_tokens / 1000) * 0.0004)
|
||||
return num_tokens, total_price
|
||||
|
||||
|
||||
@retry(tries=10, delay=60)
|
||||
def store_add_texts_with_retry(store, i):
|
||||
store.add_texts([i.page_content], metadatas=[i.metadata])
|
||||
#store_pine.add_texts([i.page_content], metadatas=[i.metadata])
|
||||
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])
|
||||
|
||||
|
||||
def call_openai_api(docs, folder_name, task_status):
|
||||
# Function to create a vector store from the documents and save it to disk.
|
||||
# Function to create a vector store from the documents and save it to disk.
|
||||
|
||||
# create output folder if it doesn't exist
|
||||
if not os.path.exists(f"{folder_name}"):
|
||||
os.makedirs(f"{folder_name}")
|
||||
|
||||
from tqdm import tqdm
|
||||
docs_test = [docs[0]]
|
||||
docs.pop(0)
|
||||
c1 = 0
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
docs_init = [docs[0]]
|
||||
docs.pop(0)
|
||||
|
||||
store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY")))
|
||||
|
||||
store = VectorCreator.create_vectorstore(
|
||||
settings.VECTOR_STORE,
|
||||
docs_init = docs_init,
|
||||
path=f"{folder_name}",
|
||||
embeddings_key=os.getenv("EMBEDDINGS_KEY")
|
||||
)
|
||||
else:
|
||||
store = VectorCreator.create_vectorstore(
|
||||
settings.VECTOR_STORE,
|
||||
path=f"{folder_name}",
|
||||
embeddings_key=os.getenv("EMBEDDINGS_KEY")
|
||||
)
|
||||
# Uncomment for MPNet embeddings
|
||||
# model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||
# hf = HuggingFaceEmbeddings(model_name=model_name)
|
||||
# store = FAISS.from_documents(docs_test, hf)
|
||||
s1 = len(docs)
|
||||
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
|
||||
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
|
||||
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
|
||||
try:
|
||||
task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
|
||||
store_add_texts_with_retry(store, i)
|
||||
@@ -56,22 +68,23 @@ def call_openai_api(docs, folder_name, task_status):
|
||||
store.save_local(f"{folder_name}")
|
||||
break
|
||||
c1 += 1
|
||||
store.save_local(f"{folder_name}")
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
store.save_local(f"{folder_name}")
|
||||
|
||||
|
||||
def get_user_permission(docs, folder_name):
|
||||
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
|
||||
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
|
||||
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
|
||||
#docs_content = (" ".join(docs))
|
||||
# docs_content = (" ".join(docs))
|
||||
docs_content = ""
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
|
||||
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
print(f"Number of Tokens = {format(tokens, ',d')}")
|
||||
print(f"Approx Cost = ${format(total_price, ',.2f')}")
|
||||
#Here we check for user permission before calling the API.
|
||||
# Here we check for user permission before calling the API.
|
||||
user_input = input("Price Okay? (Y/N) \n").lower()
|
||||
if user_input == "y":
|
||||
call_openai_api(docs, folder_name)
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import os
|
||||
import ast
|
||||
import tiktoken
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import tiktoken
|
||||
from langchain.llms import OpenAI
|
||||
from langchain.prompts import PromptTemplate
|
||||
|
||||
|
||||
def find_files(directory):
|
||||
files_list = []
|
||||
for root, dirs, files in os.walk(directory):
|
||||
@@ -13,6 +15,7 @@ def find_files(directory):
|
||||
files_list.append(os.path.join(root, file))
|
||||
return files_list
|
||||
|
||||
|
||||
def extract_functions(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
source_code = file.read()
|
||||
@@ -25,6 +28,7 @@ def extract_functions(file_path):
|
||||
functions[func_name] = func_def
|
||||
return functions
|
||||
|
||||
|
||||
def extract_classes(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
source_code = file.read()
|
||||
@@ -40,6 +44,7 @@ def extract_classes(file_path):
|
||||
classes[class_name] = ", ".join(function_names)
|
||||
return classes
|
||||
|
||||
|
||||
def extract_functions_and_classes(directory):
|
||||
files = find_files(directory)
|
||||
functions_dict = {}
|
||||
@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
|
||||
classes_dict[file] = classes
|
||||
return functions_dict, classes_dict
|
||||
|
||||
|
||||
def parse_functions(functions_dict, formats, dir):
|
||||
c1 = len(functions_dict)
|
||||
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
|
||||
print(f"Processing file {i}/{c1}")
|
||||
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
|
||||
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
|
||||
subfolders = "/".join(source_w.split("/")[:-1])
|
||||
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
|
||||
for j, (name, function) in enumerate(functions.items(), start=1):
|
||||
@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
|
||||
response = llm(prompt.format(code=function))
|
||||
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
|
||||
with open(f"outputs/{source_w}", mode) as f:
|
||||
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
|
||||
f.write(
|
||||
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
|
||||
|
||||
|
||||
def parse_classes(classes_dict, formats, dir):
|
||||
c1 = len(classes_dict)
|
||||
for i, (source, classes) in enumerate(classes_dict.items()):
|
||||
print(f"Processing file {i+1}/{c1}")
|
||||
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
|
||||
print(f"Processing file {i + 1}/{c1}")
|
||||
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
|
||||
subfolders = "/".join(source_w.split("/")[:-1])
|
||||
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
|
||||
for name, function_names in classes.items():
|
||||
print(f"Processing Class {i+1}/{c1}")
|
||||
print(f"Processing Class {i + 1}/{c1}")
|
||||
prompt = PromptTemplate(
|
||||
input_variables=["class_name", "functions_names"],
|
||||
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
|
||||
@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
|
||||
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
|
||||
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
|
||||
|
||||
|
||||
def transform_to_docs(functions_dict, classes_dict, formats, dir):
|
||||
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
|
||||
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])
|
||||
@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
|
||||
parse_classes(classes_dict, formats, dir)
|
||||
print("All done!")
|
||||
else:
|
||||
print("The API was not called. No money was spent.")
|
||||
print("The API was not called. No money was spent.")
|
||||
|
||||
1
application/parser/schema/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
from langchain.docstore.document import Document as LCDocument
|
||||
|
||||
from parser.schema.schema import BaseDocument
|
||||
from application.parser.schema.schema import BaseDocument
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import re
|
||||
import tiktoken
|
||||
|
||||
from typing import List
|
||||
from parser.schema.base import Document
|
||||
from math import ceil
|
||||
from typing import List
|
||||
|
||||
import tiktoken
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
|
||||
def separate_header_and_body(text):
|
||||
@@ -13,6 +13,7 @@ def separate_header_and_body(text):
|
||||
body = text[len(header):]
|
||||
return header, body
|
||||
|
||||
|
||||
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
|
||||
docs = []
|
||||
current_group = None
|
||||
@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
|
||||
if current_group is None:
|
||||
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
|
||||
extra_info=doc.extra_info)
|
||||
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
|
||||
elif len(tiktoken.get_encoding("cl100k_base").encode(
|
||||
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
|
||||
current_group.text += " " + doc.text
|
||||
else:
|
||||
docs.append(current_group)
|
||||
@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
|
||||
docs = []
|
||||
for doc in documents:
|
||||
@@ -43,6 +46,9 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
|
||||
docs.append(doc)
|
||||
else:
|
||||
header, body = separate_header_and_body(doc.text)
|
||||
if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens:
|
||||
body = doc.text
|
||||
header = ""
|
||||
num_body_parts = ceil(token_length / max_tokens)
|
||||
part_length = ceil(len(body) / num_body_parts)
|
||||
body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
|
||||
@@ -54,17 +60,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
|
||||
docs.append(new_doc)
|
||||
return docs
|
||||
|
||||
|
||||
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
|
||||
if token_check == False:
|
||||
if not token_check:
|
||||
return documents
|
||||
print("Grouping small documents")
|
||||
try:
|
||||
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
|
||||
except:
|
||||
except Exception:
|
||||
print("Grouping failed, try running without token_check")
|
||||
print("Separating large documents")
|
||||
try:
|
||||
documents = split_documents(documents=documents, max_tokens=max_tokens)
|
||||
except:
|
||||
except Exception:
|
||||
print("Grouping failed, try running without token_check")
|
||||
return documents
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
|
||||
Use the following pieces of context to help answer the users question.
|
||||
Use the following pieces of context to help answer the users question. If its not relevant to the question, provide friendly responses.
|
||||
You have access to chat history, and can use it to help answer the question.
|
||||
When using code examples, use the following format:
|
||||
```(language)
|
||||
(code)
|
||||
```
|
||||
----------------
|
||||
{summaries}
|
||||
@@ -1,3 +1,3 @@
|
||||
Use the following portion of a long document to see if any of the text is relevant to answer the question.
|
||||
{context}
|
||||
Provide all relevant text to the question verbatim. Summarize if needed. If nothing relevant return "-".
|
||||
Use the following pieces of context to help answer the users question. If its not relevant to the question, respond with "-"
|
||||
----------------
|
||||
{context}
|
||||
@@ -1,5 +1,5 @@
|
||||
aiodns==3.0.0
|
||||
aiohttp==3.8.4
|
||||
aiohttp==3.8.5
|
||||
aiohttp-retry==2.8.3
|
||||
aiosignal==1.3.1
|
||||
aleph-alpha-client==2.16.1
|
||||
@@ -8,48 +8,55 @@ async-timeout==4.0.2
|
||||
attrs==22.2.0
|
||||
billiard==3.6.4.0
|
||||
blobfile==2.0.1
|
||||
boto3==1.26.84
|
||||
botocore==1.29.84
|
||||
boto3==1.28.20
|
||||
celery==5.2.7
|
||||
cffi==1.15.1
|
||||
charset-normalizer==3.1.0
|
||||
click==8.1.3
|
||||
click-didyoumean==0.3.0
|
||||
click-plugins==1.1.1
|
||||
click-repl==0.2.0
|
||||
cryptography==39.0.2
|
||||
cryptography==41.0.3
|
||||
dataclasses-json==0.5.7
|
||||
decorator==5.1.1
|
||||
deeplake==3.2.13
|
||||
dill==0.3.6
|
||||
dnspython==2.3.0
|
||||
ecdsa==0.18.0
|
||||
elasticsearch==8.9.0
|
||||
entrypoints==0.4
|
||||
faiss-cpu==1.7.3
|
||||
filelock==3.9.0
|
||||
Flask==2.2.3
|
||||
Flask==2.2.5
|
||||
Flask-Cors==3.0.10
|
||||
frozenlist==1.3.3
|
||||
geojson==2.5.0
|
||||
gunicorn==20.1.0
|
||||
greenlet==2.0.2
|
||||
hub==3.0.1
|
||||
huggingface-hub==0.12.1
|
||||
humbug==0.2.8
|
||||
gpt4all==0.1.7
|
||||
huggingface-hub==0.15.1
|
||||
humbug==0.3.2
|
||||
idna==3.4
|
||||
itsdangerous==2.1.2
|
||||
Jinja2==3.1.2
|
||||
jmespath==1.0.1
|
||||
joblib==1.2.0
|
||||
kombu==5.2.4
|
||||
langchain==0.0.118
|
||||
langchain==0.0.263
|
||||
loguru==0.6.0
|
||||
lxml==4.9.2
|
||||
MarkupSafe==2.1.2
|
||||
marshmallow==3.19.0
|
||||
marshmallow-enum==1.5.1
|
||||
mpmath==1.3.0
|
||||
multidict==6.0.4
|
||||
multiprocess==0.70.14
|
||||
mypy-extensions==1.0.0
|
||||
networkx==3.0
|
||||
npx
|
||||
nltk==3.8.1
|
||||
numcodecs==0.11.0
|
||||
numpy==1.24.2
|
||||
openai==0.27.0
|
||||
openai==0.27.8
|
||||
packaging==23.0
|
||||
pathos==0.3.0
|
||||
Pillow==9.4.0
|
||||
@@ -61,32 +68,39 @@ pyasn1==0.4.8
|
||||
pycares==4.3.0
|
||||
pycparser==2.21
|
||||
pycryptodomex==3.17
|
||||
pycryptodome==3.19.0
|
||||
pydantic==1.10.5
|
||||
PyJWT==2.6.0
|
||||
pymongo==4.3.3
|
||||
pyowm==3.3.0
|
||||
PyPDF2==3.0.1
|
||||
PySocks==1.7.1
|
||||
pytest
|
||||
python-dateutil==2.8.2
|
||||
python-dotenv==1.0.0
|
||||
python-jose==3.3.0
|
||||
pytz==2022.7.1
|
||||
PyYAML==6.0
|
||||
redis==4.5.2
|
||||
redis==4.5.4
|
||||
regex==2022.10.31
|
||||
requests==2.28.2
|
||||
requests==2.31.0
|
||||
retry==0.9.2
|
||||
rsa==4.9
|
||||
s3transfer==0.6.0
|
||||
scikit-learn==1.2.2
|
||||
scipy==1.10.1
|
||||
sentencepiece
|
||||
six==1.16.0
|
||||
SQLAlchemy==1.4.46
|
||||
sympy==1.11.1
|
||||
tenacity==8.2.2
|
||||
tiktoken==0.3.0
|
||||
tokenizers==0.13.2
|
||||
threadpoolctl==3.1.0
|
||||
tiktoken
|
||||
tqdm==4.65.0
|
||||
transformers==4.26.1
|
||||
transformers==4.30.0
|
||||
typer==0.7.0
|
||||
typing-inspect==0.8.0
|
||||
typing_extensions==4.5.0
|
||||
urllib3==1.26.14
|
||||
vine==5.0.0
|
||||
wcwidth==0.2.6
|
||||
Werkzeug==2.2.3
|
||||
yarl==1.8.2
|
||||
|
||||
|
Before Width: | Height: | Size: 37 KiB |
|
Before Width: | Height: | Size: 352 KiB |
|
Before Width: | Height: | Size: 34 KiB |
|
Before Width: | Height: | Size: 631 B |
|
Before Width: | Height: | Size: 1.7 KiB |
|
Before Width: | Height: | Size: 15 KiB |
@@ -1 +0,0 @@
|
||||
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
|
||||
@@ -1,19 +0,0 @@
|
||||
function resetApiKey() {
|
||||
const modal = document.getElementById("modal");
|
||||
modal.classList.toggle("hidden");
|
||||
}
|
||||
|
||||
const apiKeyForm = document.getElementById("api-key-form");
|
||||
if (apiKeyForm) {
|
||||
apiKeyForm.addEventListener("submit", function(event) {
|
||||
event.preventDefault();
|
||||
|
||||
const apiKeyInput = document.getElementById("api-key-input");
|
||||
const apiKey = apiKeyInput.value;
|
||||
|
||||
localStorage.setItem("apiKey", apiKey);
|
||||
|
||||
apiKeyInput.value = "";
|
||||
modal.classList.toggle("hidden");
|
||||
});
|
||||
}
|
||||
@@ -1,76 +0,0 @@
|
||||
var form = document.getElementById('message-form');
|
||||
var errorModal = document.getElementById('error-alert')
|
||||
document.getElementById('close').addEventListener('click',()=>{
|
||||
errorModal.classList.toggle('hidden')
|
||||
})
|
||||
|
||||
|
||||
function submitForm(event){
|
||||
event.preventDefault()
|
||||
var message = document.getElementById("message-input").value;
|
||||
console.log(message.length)
|
||||
if(message.length === 0){
|
||||
return
|
||||
}
|
||||
msg_html = '<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end"><p class="text-sm">'
|
||||
msg_html += message
|
||||
msg_html += '</p></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
document.getElementById("message-input").value = "";
|
||||
document.getElementById("button-submit").innerHTML = '<i class="fa fa-circle-o-notch fa-spin"></i> Thinking...';
|
||||
document.getElementById("button-submit").disabled = true;
|
||||
if (localStorage.getItem('activeDocs') == null) {
|
||||
localStorage.setItem('activeDocs', 'default')
|
||||
}
|
||||
|
||||
|
||||
fetch('/api/answer', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
|
||||
body: JSON.stringify({question: message,
|
||||
api_key: localStorage.getItem('apiKey'),
|
||||
embeddings_key: localStorage.getItem('apiKey'),
|
||||
history: localStorage.getItem('chatHistory'),
|
||||
active_docs: localStorage.getItem('activeDocs')}),
|
||||
}).then((response)=> response.json())
|
||||
.then(data => {
|
||||
console.log('Success:', data);
|
||||
if(data.error){
|
||||
document.getElementById('text-error').textContent = `Error : ${JSON.stringify(data.message)}`
|
||||
errorModal.classList.toggle('hidden')
|
||||
}
|
||||
if(data.answer){
|
||||
msg_html = '<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start"><code class="text-sm">'
|
||||
data.answer = data.answer.replace(/\n/g, "<br>");
|
||||
msg_html += data.answer
|
||||
msg_html += '</code></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
}
|
||||
document.getElementById("button-submit").innerHTML = 'Send';
|
||||
document.getElementById("button-submit").disabled = false;
|
||||
let chatHistory = [message, data.answer || ''];
|
||||
localStorage.setItem('chatHistory', JSON.stringify(chatHistory));
|
||||
|
||||
|
||||
|
||||
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('Error:', error);
|
||||
// console.log(error);
|
||||
// document.getElementById("button-submit").innerHTML = 'Send';
|
||||
// document.getElementById("button-submit").disabled = false;
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
//window.addEventListener('submit',submitForm)
|
||||
// rewrite using id = button-submit
|
||||
document.getElementById("button-submit").addEventListener('click',submitForm)
|
||||
@@ -1,15 +0,0 @@
|
||||
document.getElementById("select-docs").addEventListener("change", function() {
|
||||
localStorage.setItem('activeDocs', this.value)
|
||||
fetch('/api/docs_check', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({docs: this.value}),
|
||||
}).then(response => response.json()).then(
|
||||
data => {
|
||||
console.log('Success:', data);
|
||||
}
|
||||
)
|
||||
});
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
|
||||
|
||||
|
||||
@media screen and (max-width: 1024px) {
|
||||
.text-lg {
|
||||
font-size: 3.125rem;
|
||||
margin: 2rem;
|
||||
line-height: inherit;
|
||||
}
|
||||
.text-sm {
|
||||
font-size: 2.5rem;
|
||||
margin: 1.5rem;
|
||||
line-height: inherit;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
.loader {
|
||||
border: 16px solid #f3f3f3; /* Light grey */
|
||||
border-top: 16px solid #3498db; /* Blue */
|
||||
border-radius: 50%;
|
||||
width: 120px;
|
||||
height: 120px;
|
||||
animation: spin 2s linear infinite;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
0% { transform: rotate(0deg); }
|
||||
100% { transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
|
||||
@@ -1,215 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>DocsGPT 🦖 Preview</title>
|
||||
<link href="{{url_for('static',filename='dist/css/output.css')}}" rel="stylesheet">
|
||||
<link rel="favicon" href="{{ url_for('static', filename='favicon/favicon.ico') }}">
|
||||
<link rel="apple-touch-icon" sizes="180x180" href="{{ url_for('static', filename='favicon/apple-touch-icon.png') }}">
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="{{ url_for('static', filename='favicon/favicon-32x32.png') }}">
|
||||
<link rel="icon" type="image/png" sizes="16x16" href="{{ url_for('static', filename='favicon/favicon-16x16.png') }}">
|
||||
<link rel="manifest" href="{{ url_for('static', filename='favicon//site.webmanifest') }}">
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
<header class="bg-white p-2 flex justify-between items-center">
|
||||
<h1 class="text-lg font-medium">DocsGPT 🦖 Preview</h1>
|
||||
<div>
|
||||
<a href="https://github.com/arc53/docsgpt" class="text-blue-500 hover:text-blue-800 text-sm">About</a>
|
||||
{% if not api_key_set %}
|
||||
<button class="text-sm text-yellow-500 hover:text-yellow-800" onclick="resetApiKey()">Reset Key</button>
|
||||
{% endif %}
|
||||
</div>
|
||||
</header>
|
||||
|
||||
|
||||
<!-- Alert Info -->
|
||||
<div class="border flex justify-between
|
||||
w-auto px-4 py-3 rounded relative
|
||||
hidden" style="background-color: rgb(197, 51, 51);color: white;" id="error-alert" role="alert">
|
||||
<span class="block sm:inline" id="text-error"></span>
|
||||
<strong class="text-xl align-center alert-del" style="cursor: pointer;" id="close">×</strong>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="lg:flex ml-2 mr-2">
|
||||
<div class="lg:w-3/4 min-h-screen max-h-screen">
|
||||
<div class="w-full flex flex-col h-5/6">
|
||||
<div id="messages-container" style="overflow: auto;" class="sm:max-lg:mb-[12rem]">
|
||||
|
||||
<div id="messages" class="w-full flex flex-col mt-2" >
|
||||
<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start">
|
||||
<p class="text-sm">Hello, ask me anything about this library. Im here to help</p>
|
||||
</div>
|
||||
<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end">
|
||||
<p class="text-sm">How to merge tables?</p>
|
||||
</div>
|
||||
<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start">
|
||||
<p class="text-sm">To merge two tables in pandas, you can use the pd.merge() function. The basic syntax is:<br>
|
||||
pd.merge(left, right, on, how)<br>
|
||||
where left and right are the two tables to merge, on is the column to merge on, and how is the type of merge to perform.<br>
|
||||
For example, to merge the two tables df1 and df2 on the column 'key', you can use:<br>
|
||||
pd.merge(df1, df2, on='key', how='left')<br>
|
||||
This will return a new DataFrame with all the columns from both tables, and only the rows that match the 'key' column. </p>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="fixed bottom-0 w-full mt-4 mb-2 lg:w-3/4">
|
||||
<form id="message-form" autocomplete="off" class="flex items-stretch">
|
||||
<input autocomplete="off" id="message-input" class="bg-white p-2 rounded-lg ml-2 text-sm w-full" type="text" placeholder="Type your message here...">
|
||||
<button id="button-submit" class="bg-blue-500 text-white p-2 rounded-lg ml-2 mr-2 text-sm sm:max-lg:p-5" type="submit">Send</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<div class="lg:w-1/4 p-2 sm:max-lg:hidden">
|
||||
<p class="text-sm">This is a chatbot that uses the GPT-3, Faiss and <a href="https://github.com/hwchase17/langchain" class="text-blue-500 hover:text-blue-800">LangChain</a> to answer questions</p>
|
||||
<br>
|
||||
<p class="text-sm">The source code is available on <a href="https://github.com/arc53/docsgpt" class="text-blue-500 hover:text-blue-800">Github</a></p><br>
|
||||
<p class="text-sm">Currently It uses python pandas documentation, so it will respond to information relevant to pandas. If you want to train it on different documentation - <a href="https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation" class="text-blue-500 hover:text-blue-800"> please follow this guide </a></p><br>
|
||||
<p class="text-sm">If you want to launch it on your own server - <a href="https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation" class="text-blue-500 hover:text-blue-800"> follow this guide </a></p><br>
|
||||
<label class="block mb-2 text-sm font-medium text-gray-900">Select documentation from DocsHUB</label>
|
||||
<select id="select-docs" class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5">
|
||||
<option selected value="default">Choose documentation</option>
|
||||
<option value="default">Default</option>
|
||||
</select>
|
||||
<form action="/api/upload" method="post" enctype="multipart/form-data" class="mt-2">
|
||||
<input type="file" name="file" class="py-4" id="file-upload">
|
||||
<input type="text" name="user" value="local" hidden>
|
||||
<input type="text" name="name" placeholder="Name:">
|
||||
|
||||
|
||||
<button type="submit" class="py-2 px-4 text-white bg-blue-500 rounded-md hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500">
|
||||
Upload
|
||||
</button>
|
||||
</form>
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex items-center justify-center h-full">
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
{% if not api_key_set %}
|
||||
|
||||
<div class="fixed z-10 overflow-y-auto top-0 w-full left-0 show" id="modal">
|
||||
<div class="flex items-center justify-center min-height-100vh pt-4 px-4 pb-20 text-center sm:block sm:p-0">
|
||||
<div class="fixed inset-0 transition-opacity">
|
||||
<div class="absolute inset-0 bg-gray-900 opacity-75" />
|
||||
</div>
|
||||
<span class="hidden sm:inline-block sm:align-middle sm:h-screen">​</span>
|
||||
<div class=" text-sm inline-block align-center bg-white rounded-lg text-left overflow-hidden shadow-xl transform transition-all sm:my-8 sm:align-middle sm:max-w-lg sm:w-full" role="dialog" aria-modal="true" aria-labelledby="modal-headline">
|
||||
<form id="api-key-form">
|
||||
<div class="bg-white px-4 pt-5 pb-4 sm:p-6 sm:pb-4">
|
||||
<h2>Before you can start using DocsGPT we need you to provide an API key for llm. Currently, we support only OpenAI but soon many more. You can find it <a class="text-blue-500 hover:text-blue-800" href="https://platform.openai.com/account/api-keys">here</a></h2><br>
|
||||
<label>OpenAI API key:</label>
|
||||
|
||||
<input id="api-key-input" type="password" class="w-full bg-gray-100 p-2 mt-2 mb-3" placeholder="Paste you Api Key here">
|
||||
|
||||
</div>
|
||||
<div class="bg-gray-200 px-4 py-3 text-right">
|
||||
<button type="submit" class="py-2 px-4 bg-blue-500 text-white rounded hover:bg-blue-700 mr-2">Save</button>
|
||||
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
||||
|
||||
<script>
|
||||
function docsIndex() {
|
||||
// loads latest index from https://raw.githubusercontent.com/arc53/DocsHUB/main/combined.json
|
||||
// and stores it in localStorage
|
||||
fetch('/api/combine')
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
localStorage.setItem("docsIndex", JSON.stringify(data));
|
||||
localStorage.setItem("docsIndexDate", Date.now());
|
||||
generateOptions()
|
||||
}
|
||||
|
||||
)
|
||||
|
||||
}
|
||||
function generateOptions(){
    // Populates the <select id="select-docs"> dropdown from the docs index
    // cached in localStorage under the key 'docsIndex'.
    //
    // The parsed index is held in a local constant. Assigning to the bare name
    // `docsIndex` would overwrite the global docsIndex() function declared in
    // this script (and is a ReferenceError in strict mode).
    //
    // The "{{ embeddings_choice }}" placeholders are filled in by the
    // server-side template before the script reaches the browser.
    const index = JSON.parse(localStorage.getItem('docsIndex'));
    const select = document.getElementById("select-docs");

    // A missing cache entry parses to null; for...in over null iterates nothing.
    for (const key in index) {
        const entry = index[key];
        const option = document.createElement("option");

        if (entry.location == 'docshub') {
            option.text = entry.name + " " + entry.version;
            // Entries whose name equals their language use the ".project" path
            // segment; all others are addressed as language/name.
            if (entry.name == entry.language) {
                option.value = entry.name + "/" + ".project" + "/" + entry.version + "/{{ embeddings_choice }}/";
            } else {
                option.value = entry.language + "/" + entry.name + "/" + entry.version + "/{{ embeddings_choice }}/";
            }
            // Only offer docshub entries built with the active embeddings model.
            if (entry.model == "{{ embeddings_choice }}") {
                select.add(option);
            }
        } else {
            option.text = entry.name;
            option.value = entry.location + "/" + entry.name;
            select.add(option);
        }
    }
}
|
||||
{% if not api_key_set %}
|
||||
if (localStorage.getItem('apiKey') === null) {
|
||||
console.log("apiKey is not set")
|
||||
document.getElementById('modal').classList.toggle('hidden')
|
||||
}
|
||||
{% endif %}
|
||||
if (localStorage.getItem('docsIndex') === null) {
|
||||
console.log("docsIndex is not set")
|
||||
docsIndex()
|
||||
}
|
||||
else if (localStorage.getItem("docsIndexDate") < Date.now() - 900000) {
|
||||
console.log("docsIndex is older than 15 minutes")
|
||||
docsIndex()
|
||||
}
|
||||
|
||||
generateOptions()
|
||||
|
||||
</script>
|
||||
{% if not api_key_set %}
|
||||
<script src="{{url_for('static',filename='src/authapi.js')}}"></script>
|
||||
{% endif %}
|
||||
<script src="{{url_for('static',filename='src/chat.js')}}"></script>
|
||||
<script src="{{url_for('static',filename='src/choiceChange.js')}}"></script>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
0
application/vectorstore/__init__.py
Normal file
51
application/vectorstore/base.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import os
|
||||
from langchain.embeddings import (
|
||||
OpenAIEmbeddings,
|
||||
HuggingFaceEmbeddings,
|
||||
CohereEmbeddings,
|
||||
HuggingFaceInstructEmbeddings,
|
||||
)
|
||||
from application.core.settings import settings
|
||||
|
||||
class BaseVectorStore(ABC):
    """Abstract base for vector store backends; also builds embedding clients."""

    def __init__(self):
        pass

    @abstractmethod
    def search(self, *args, **kwargs):
        """Query the store; each concrete backend supplies the implementation."""
        pass

    def is_azure_configured(self):
        """Return a truthy value only when all three Azure OpenAI settings are set.

        The result is the value produced by the ``and`` chain itself (the first
        falsy setting, or the last one), not necessarily a ``bool``.
        """
        return (
            settings.OPENAI_API_BASE
            and settings.OPENAI_API_VERSION
            and settings.AZURE_DEPLOYMENT_NAME
        )

    def _get_embeddings(self, embeddings_name, embeddings_key=None):
        """Instantiate the embeddings client registered under ``embeddings_name``.

        Args:
            embeddings_name: One of the keys of the mapping below.
            embeddings_key: API key forwarded to the OpenAI (non-Azure) and
                Cohere clients; ignored by the HuggingFace ones.

        Returns:
            A newly constructed embeddings object.

        Raises:
            ValueError: If ``embeddings_name`` is not a supported key.
        """
        supported = {
            "openai_text-embedding-ada-002": OpenAIEmbeddings,
            "huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceEmbeddings,
            "huggingface_hkunlp/instructor-large": HuggingFaceInstructEmbeddings,
            "cohere_medium": CohereEmbeddings,
        }

        embeddings_cls = supported.get(embeddings_name)
        if embeddings_cls is None:
            raise ValueError(f"Invalid embeddings_name: {embeddings_name}")

        if embeddings_name == "cohere_medium":
            return embeddings_cls(cohere_api_key=embeddings_key)

        if embeddings_name != "openai_text-embedding-ada-002":
            # HuggingFace variants are constructed without arguments.
            return embeddings_cls()

        if not self.is_azure_configured():
            return embeddings_cls(openai_api_key=embeddings_key)

        # Azure path: sets OPENAI_API_TYPE in the process environment and selects
        # the configured embeddings deployment; embeddings_key is not passed here.
        os.environ["OPENAI_API_TYPE"] = "azure"
        return embeddings_cls(model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME)
|
||||
|
||||
221
application/vectorstore/elasticsearch.py
Normal file
@@ -0,0 +1,221 @@
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from application.core.settings import settings
|
||||
import elasticsearch
|
||||
|
||||
class Document(str):
    """A string that also carries its text and metadata as attributes.

    The instance compares and behaves like ``page_content`` (it *is* a ``str``),
    while exposing ``page_content`` and ``metadata`` attributes.
    """

    def __new__(cls, page_content: str, metadata: dict):
        # str is immutable, so the value must be fixed in __new__; the extra
        # attributes are then attached to the freshly created instance.
        doc = super().__new__(cls, page_content)
        doc.page_content, doc.metadata = page_content, metadata
        return doc
|
||||
|
||||
|
||||
|
||||
|
||||
class ElasticsearchStore(BaseVectorStore):
    """Vector store backed by an Elasticsearch index.

    Each indexed document holds ``text``, ``vector`` and ``metadata``. Documents
    belonging to this store are selected by matching ``metadata.store.keyword``
    against ``self.path``.
    """

    # Elasticsearch client shared by all instances; created on first construction.
    _es_connection = None

    def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
        """Remember the store identity and lazily create the shared client.

        Args:
            path: Store path; every "application/indexes/" substring is removed
                and trailing slashes are stripped before it is used as a filter.
            embeddings_key: API key handed to the embeddings client.
            index_name: Elasticsearch index to read from and write to.

        Raises:
            ValueError: If neither ELASTIC_URL nor ELASTIC_CLOUD_ID is configured
                when the shared client has to be created.
        """
        super().__init__()
        self.path = path.replace("application/indexes/", "").rstrip("/")
        self.embeddings_key = embeddings_key
        self.index_name = index_name

        if ElasticsearchStore._es_connection is None:
            connection_params = {}
            if settings.ELASTIC_URL:
                connection_params["hosts"] = [settings.ELASTIC_URL]
                # NOTE(review): this branch uses `http_auth` while the cloud branch
                # uses `basic_auth` - confirm against the installed client version.
                connection_params["http_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
            elif settings.ELASTIC_CLOUD_ID:
                connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID
                connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
            else:
                raise ValueError("Please provide either elasticsearch_url or cloud_id.")

            ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params)

        self.docsearch = ElasticsearchStore._es_connection

    @staticmethod
    def connect_to_elasticsearch(
        *,
        es_url = None,
        cloud_id = None,
        api_key = None,
        username = None,
        password = None,
    ):
        """Create an Elasticsearch client and verify it by calling ``info()``.

        Declared as a static method because it takes no ``self``; without the
        decorator a call through an instance would fail with a TypeError.

        Args:
            es_url: URL of an Elasticsearch node (mutually exclusive with cloud_id).
            cloud_id: Elastic Cloud id (mutually exclusive with es_url).
            api_key: API key; takes precedence over username/password.
            username: Basic-auth user, used only together with ``password``.
            password: Basic-auth password.

        Returns:
            The connected ``elasticsearch.Elasticsearch`` client.

        Raises:
            ImportError: If the elasticsearch package is not installed.
            ValueError: If both or neither of es_url / cloud_id are given.
        """
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )

        if es_url and cloud_id:
            raise ValueError(
                "Both es_url and cloud_id are defined. Please provide only one."
            )

        connection_params = {}

        if es_url:
            connection_params["hosts"] = [es_url]
        elif cloud_id:
            connection_params["cloud_id"] = cloud_id
        else:
            raise ValueError("Please provide either elasticsearch_url or cloud_id.")

        if api_key:
            connection_params["api_key"] = api_key
        elif username and password:
            connection_params["basic_auth"] = (username, password)

        es_client = elasticsearch.Elasticsearch(
            **connection_params,
        )
        # Any exception raised by this call propagates to the caller unchanged.
        es_client.info()

        return es_client

    def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
        """Search this store with a kNN clause and a text-match clause.

        Args:
            question: Query text; embedded for kNN and matched against ``text``.
            k: Number of hits requested (``size`` and kNN ``k``).
            index_name: Accepted for interface compatibility but not used; the
                search always runs against ``self.index_name``.

        Returns:
            A list of ``Document`` objects built from each hit's ``_source``.
        """
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vector = embeddings.embed_query(question)
        knn = {
            "filter": [{"match": {"metadata.store.keyword": self.path}}],
            "field": "vector",
            "k": k,
            "num_candidates": 100,
            "query_vector": vector,
        }
        full_query = {
            "knn": knn,
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "text": {
                                    "query": question,
                                }
                            }
                        }
                    ],
                    "filter": [{"match": {"metadata.store.keyword": self.path}}],
                }
            },
            # Only the "query" and "knn" parts are sent below; "rank" is not.
            "rank": {"rrf": {}},
        }
        resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn'])
        # Build Document objects from each hit: text -> page_content, metadata -> metadata.
        doc_list = []
        for hit in resp['hits']['hits']:
            doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata']))
        return doc_list

    def _create_index_if_not_exists(
        self, index_name, dims_length
    ):
        """Create ``index_name`` with the vector mapping unless it already exists."""
        if self._es_connection.indices.exists(index=index_name):
            print(f"Index {index_name} already exists.")
        else:
            indexSettings = self.index(
                dims_length=dims_length,
            )
            self._es_connection.indices.create(index=index_name, **indexSettings)

    def index(
        self,
        dims_length,
    ):
        """Return index-creation settings with a cosine ``dense_vector`` field.

        Args:
            dims_length: Dimensionality of the stored embedding vectors.
        """
        return {
            "mappings": {
                "properties": {
                    "vector": {
                        "type": "dense_vector",
                        "dims": dims_length,
                        "index": True,
                        "similarity": "cosine",
                    },
                }
            }
        }

    def add_texts(
        self,
        texts,
        metadatas = None,
        ids = None,
        refresh_indices = True,
        create_index_if_not_exists = True,
        bulk_kwargs = None,
        **kwargs,
    ):
        """Embed ``texts`` and bulk-index them into ``self.index_name``.

        Args:
            texts: Iterable of strings to index (any iterable, including generators).
            metadatas: Optional sequence of metadata dicts, parallel to ``texts``.
            ids: Optional document ids; random UUID4 strings are generated if omitted.
            refresh_indices: Forwarded to the bulk helper as ``refresh``.
            create_index_if_not_exists: Create the index (sized from the first
                embedding vector) before indexing.
            bulk_kwargs: Extra keyword arguments for the bulk helper.

        Returns:
            The list of ids that were indexed, or ``[]`` when ``texts`` is empty.

        Raises:
            BulkIndexError: Re-raised after the first error reason is printed.
        """
        from elasticsearch.helpers import BulkIndexError, bulk
        import uuid

        bulk_kwargs = bulk_kwargs or {}

        # Materialize once: the texts are traversed several times below, which
        # would silently exhaust a generator after the first pass.
        texts = list(texts)
        if not texts:
            # Nothing to embed or index; also avoids indexing into an empty
            # vector list when sizing the index.
            return []

        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)

        vectors = embeddings.embed_documents(texts)

        dims_length = len(vectors[0])

        if create_index_if_not_exists:
            self._create_index_if_not_exists(
                index_name=self.index_name, dims_length=dims_length
            )

        requests = []
        for i, (text, vector) in enumerate(zip(texts, vectors)):
            metadata = metadatas[i] if metadatas else {}

            requests.append(
                {
                    "_op_type": "index",
                    "_index": self.index_name,
                    "text": text,
                    "vector": vector,
                    "metadata": metadata,
                    "_id": ids[i],
                }
            )

        if len(requests) > 0:
            try:
                bulk(
                    self._es_connection,
                    requests,
                    stats_only=True,
                    refresh=refresh_indices,
                    **bulk_kwargs,
                )
                return ids
            except BulkIndexError as e:
                print(f"Error adding texts: {e}")
                firstError = e.errors[0].get("index", {}).get("error", {})
                print(f"First error reason: {firstError.get('reason')}")
                raise
        else:
            return []

    def delete_index(self):
        """Delete this store's documents (matched by path); the index itself is kept."""
        self._es_connection.delete_by_query(index=self.index_name, query={"match": {
            "metadata.store.keyword": self.path}},)
|
||||
|
||||
26
application/vectorstore/faiss.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from langchain import FAISS
|
||||
from application.core.settings import settings
|
||||
|
||||
class FaissStore(BaseVectorStore):
    """Vector store backed by a langchain FAISS index."""

    def __init__(self, path, embeddings_key, docs_init=None):
        """Build a new index from documents or load an existing one from disk.

        Args:
            path: Directory of the saved index; used only when loading.
            embeddings_key: API key for the embeddings client.
            docs_init: Optional documents; when truthy a fresh index is built
                from them instead of loading from ``path``.
        """
        super().__init__()
        self.path = path
        if docs_init:
            self.docsearch = FAISS.from_documents(
                docs_init, self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
            )
        else:
            # Honour the caller-supplied key when loading too; fall back to the
            # configured key only when none was given.
            load_key = embeddings_key if embeddings_key is not None else settings.EMBEDDINGS_KEY
            self.docsearch = FAISS.load_local(
                self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, load_key)
            )

    def search(self, *args, **kwargs):
        """Delegate to the underlying index's ``similarity_search``."""
        return self.docsearch.similarity_search(*args, **kwargs)

    def add_texts(self, *args, **kwargs):
        """Delegate to the underlying index's ``add_texts``."""
        return self.docsearch.add_texts(*args, **kwargs)

    def save_local(self, *args, **kwargs):
        """Delegate to the underlying index's ``save_local``."""
        return self.docsearch.save_local(*args, **kwargs)
|
||||
16
application/vectorstore/vector_creator.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from application.vectorstore.faiss import FaissStore
|
||||
from application.vectorstore.elasticsearch import ElasticsearchStore
|
||||
|
||||
|
||||
class VectorCreator:
    """Factory that maps a vector store type name to its implementation class."""

    # Registry of supported backends, keyed by lower-case type name.
    vectorstores = {
        'faiss': FaissStore,
        'elasticsearch': ElasticsearchStore,
    }

    @classmethod
    def create_vectorstore(cls, type, *args, **kwargs):
        """Instantiate the store registered under ``type`` (case-insensitive).

        Remaining positional and keyword arguments are forwarded to the store's
        constructor.

        Raises:
            ValueError: If no store is registered for ``type``.
        """
        store_factory = cls.vectorstores.get(type.lower())
        if store_factory:
            return store_factory(*args, **kwargs)
        raise ValueError(f"No vectorstore class found for type {type}")
|
||||
@@ -1,27 +1,35 @@
|
||||
import requests
|
||||
import nltk
|
||||
import os
|
||||
|
||||
from parser.file.bulk import SimpleDirectoryReader
|
||||
from parser.schema.base import Document
|
||||
from parser.open_ai_func import call_openai_api
|
||||
from parser.token_func import group_split
|
||||
from celery import current_task
|
||||
|
||||
|
||||
import shutil
|
||||
import string
|
||||
import zipfile
|
||||
import shutil
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import nltk
|
||||
import requests
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.parser.file.bulk import SimpleDirectoryReader
|
||||
from application.parser.open_ai_func import call_openai_api
|
||||
from application.parser.schema.base import Document
|
||||
from application.parser.token_func import group_split
|
||||
|
||||
try:
|
||||
nltk.download('punkt', quiet=True)
|
||||
nltk.download('averaged_perceptron_tagger', quiet=True)
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
|
||||
def metadata_from_filename(title):
    """Build document metadata from a slash-separated file path.

    The ``store`` value is the second and third path segments joined by "/".
    Raises IndexError when ``title`` has fewer than three segments.
    """
    segments = title.split('/')
    return {'title': title, 'store': f"{segments[1]}/{segments[2]}"}
|
||||
|
||||
|
||||
def generate_random_string(length):
    """Return ``length`` ASCII letters, cycling through a-z then A-Z.

    Despite the name, the result is fully deterministic: position ``i`` always
    yields ``string.ascii_letters[i % 52]``.
    """
    alphabet = string.ascii_letters
    letters = []
    for position in range(length):
        letters.append(alphabet[position % 52])
    return ''.join(letters)
|
||||
|
||||
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
# directory = 'inputs' or 'temp'
|
||||
@@ -38,13 +46,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
min_tokens = 150
|
||||
max_tokens = 1250
|
||||
full_path = directory + '/' + user + '/' + name_job
|
||||
import sys
|
||||
print(full_path, file=sys.stderr)
|
||||
# check if API_URL env variable is set
|
||||
if not os.environ.get('API_URL'):
|
||||
url = 'http://localhost:5001/api/download'
|
||||
else:
|
||||
url = os.environ.get('API_URL') + '/api/download'
|
||||
file_data = {'name': name_job, 'file': filename, 'user': user}
|
||||
response = requests.get(url, params=file_data)
|
||||
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
|
||||
# check if file is in the response
|
||||
print(response, file=sys.stderr)
|
||||
file = response.content
|
||||
|
||||
if not os.path.exists(full_path):
|
||||
@@ -52,19 +60,17 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
with open(full_path + '/' + filename, 'wb') as f:
|
||||
f.write(file)
|
||||
|
||||
#check if file is .zip and extract it
|
||||
# check if file is .zip and extract it
|
||||
if filename.endswith('.zip'):
|
||||
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
|
||||
zip_ref.extractall(full_path)
|
||||
os.remove(full_path + '/' + filename)
|
||||
|
||||
|
||||
import time
|
||||
self.update_state(state='PROGRESS', meta={'current': 1})
|
||||
|
||||
raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
|
||||
required_exts=formats, num_files_limit=limit,
|
||||
exclude_hidden=exclude).load_data()
|
||||
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
|
||||
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
|
||||
|
||||
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
@@ -72,28 +78,30 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
call_openai_api(docs, full_path, self)
|
||||
self.update_state(state='PROGRESS', meta={'current': 100})
|
||||
|
||||
if sample == True:
|
||||
if sample:
|
||||
for i in range(min(5, len(raw_docs))):
|
||||
print(raw_docs[i].text)
|
||||
|
||||
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
|
||||
# and send them to the server (provide user and name in form)
|
||||
if not os.environ.get('API_URL'):
|
||||
url = 'http://localhost:5001/api/upload_index'
|
||||
else:
|
||||
url = os.environ.get('API_URL') + '/api/upload_index'
|
||||
file_data = {'name': name_job, 'user': user}
|
||||
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
|
||||
'file_pkl': open(full_path + '/index.pkl', 'rb')}
|
||||
response = requests.post(url, files=files, data=file_data)
|
||||
|
||||
#deletes remote
|
||||
if not os.environ.get('API_URL'):
|
||||
url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
|
||||
'file_pkl': open(full_path + '/index.pkl', 'rb')}
|
||||
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
|
||||
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
|
||||
else:
|
||||
url = os.environ.get('API_URL') + '/api/delete_old?path=' + 'inputs/' + user + '/' + name_job
|
||||
response = requests.get(url)
|
||||
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
|
||||
|
||||
|
||||
# delete local
|
||||
shutil.rmtree(full_path)
|
||||
|
||||
return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user}
|
||||
return {
|
||||
'directory': directory,
|
||||
'formats': formats,
|
||||
'name_job': name_job,
|
||||
'filename': filename,
|
||||
'user': user,
|
||||
'limited': False
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from app import app
|
||||
from application.app import app
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
app.run(debug=True, port=7091)
|
||||
|
||||
2
codecov.yml
Normal file
@@ -0,0 +1,2 @@
|
||||
ignore:
|
||||
- "*/tests/*"
|
||||
71
docker-compose-azure.yaml
Normal file
@@ -0,0 +1,71 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
frontend:
|
||||
build: ./frontend
|
||||
environment:
|
||||
- VITE_API_HOST=http://localhost:7091
|
||||
- VITE_API_STREAMING=$VITE_API_STREAMING
|
||||
ports:
|
||||
- "5173:5173"
|
||||
depends_on:
|
||||
- backend
|
||||
|
||||
backend:
|
||||
build: ./application
|
||||
environment:
|
||||
- API_KEY=$OPENAI_API_KEY
|
||||
- EMBEDDINGS_KEY=$OPENAI_API_KEY
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/1
|
||||
- MONGO_URI=mongodb://mongo:27017/docsgpt
|
||||
- OPENAI_API_KEY=$OPENAI_API_KEY
|
||||
- OPENAI_API_BASE=$OPENAI_API_BASE
|
||||
- OPENAI_API_VERSION=$OPENAI_API_VERSION
|
||||
- AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
|
||||
- AZURE_EMBEDDINGS_DEPLOYMENT_NAME=$AZURE_EMBEDDINGS_DEPLOYMENT_NAME
|
||||
ports:
|
||||
- "7091:7091"
|
||||
volumes:
|
||||
- ./application/indexes:/app/application/indexes
|
||||
- ./application/inputs:/app/application/inputs
|
||||
- ./application/vectors:/app/application/vectors
|
||||
depends_on:
|
||||
- redis
|
||||
- mongo
|
||||
|
||||
worker:
|
||||
build: ./application
|
||||
command: celery -A application.app.celery worker -l INFO
|
||||
environment:
|
||||
- API_KEY=$OPENAI_API_KEY
|
||||
- EMBEDDINGS_KEY=$OPENAI_API_KEY
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/1
|
||||
- MONGO_URI=mongodb://mongo:27017/docsgpt
|
||||
- API_URL=http://backend:7091
|
||||
- OPENAI_API_KEY=$OPENAI_API_KEY
|
||||
- OPENAI_API_BASE=$OPENAI_API_BASE
|
||||
- OPENAI_API_VERSION=$OPENAI_API_VERSION
|
||||
- AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
|
||||
- AZURE_EMBEDDINGS_DEPLOYMENT_NAME=$AZURE_EMBEDDINGS_DEPLOYMENT_NAME
|
||||
depends_on:
|
||||
- redis
|
||||
- mongo
|
||||
|
||||
redis:
|
||||
image: redis:6-alpine
|
||||
ports:
|
||||
- 6379:6379
|
||||
|
||||
mongo:
|
||||
image: mongo:6
|
||||
ports:
|
||||
- 27017:27017
|
||||
volumes:
|
||||
- mongodb_data_container:/data/db
|
||||
|
||||
|
||||
|
||||
volumes:
|
||||
mongodb_data_container:
|
||||
20
docker-compose-dev.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
|
||||
redis:
|
||||
image: redis:6-alpine
|
||||
ports:
|
||||
- 6379:6379
|
||||
|
||||
mongo:
|
||||
image: mongo:6
|
||||
ports:
|
||||
- 27017:27017
|
||||
volumes:
|
||||
- mongodb_data_container:/data/db
|
||||
|
||||
|
||||
|
||||
volumes:
|
||||
mongodb_data_container:
|
||||
26
docker-compose-local.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
frontend:
|
||||
build: ./frontend
|
||||
environment:
|
||||
- VITE_API_HOST=http://localhost:7091
|
||||
- VITE_API_STREAMING=$VITE_API_STREAMING
|
||||
- VITE_EMBEDDINGS_NAME=$EMBEDDINGS_NAME
|
||||
ports:
|
||||
- "5173:5173"
|
||||
|
||||
redis:
|
||||
image: redis:6-alpine
|
||||
ports:
|
||||
- 6379:6379
|
||||
|
||||
mongo:
|
||||
image: mongo:6
|
||||
ports:
|
||||
- 27017:27017
|
||||
volumes:
|
||||
- mongodb_data_container:/data/db
|
||||
|
||||
volumes:
|
||||
mongodb_data_container:
|
||||
@@ -4,41 +4,45 @@ services:
|
||||
frontend:
|
||||
build: ./frontend
|
||||
environment:
|
||||
- VITE_API_HOST=http://localhost:5001
|
||||
- VITE_API_HOST=http://localhost:7091
|
||||
- VITE_API_STREAMING=$VITE_API_STREAMING
|
||||
ports:
|
||||
- "5173:5173"
|
||||
depends_on:
|
||||
- backend
|
||||
- backend
|
||||
|
||||
backend:
|
||||
build: ./application
|
||||
environment:
|
||||
- API_KEY=<your_api_key>
|
||||
- EMBEDDINGS_KEY=<your_api_key>
|
||||
- API_KEY=$OPENAI_API_KEY
|
||||
- EMBEDDINGS_KEY=$OPENAI_API_KEY
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/1
|
||||
- MONGO_URI=mongodb://mongo:27017/docsgpt
|
||||
- SELF_HOSTED_MODEL=$SELF_HOSTED_MODEL
|
||||
ports:
|
||||
- "5001:5001"
|
||||
- "7091:7091"
|
||||
volumes:
|
||||
- app_data_container:/app
|
||||
- ./application/indexes:/app/application/indexes
|
||||
- ./application/inputs:/app/application/inputs
|
||||
- ./application/vectors:/app/application/vectors
|
||||
depends_on:
|
||||
- redis
|
||||
- mongo
|
||||
- redis
|
||||
- mongo
|
||||
|
||||
worker:
|
||||
build: ./application
|
||||
command: celery -A app.celery worker -l INFO
|
||||
command: celery -A application.app.celery worker -l INFO
|
||||
environment:
|
||||
- API_KEY=<your_api_key>
|
||||
- EMBEDDINGS_KEY=<your_api_key>
|
||||
- API_KEY=$OPENAI_API_KEY
|
||||
- EMBEDDINGS_KEY=$OPENAI_API_KEY
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/1
|
||||
- MONGO_URI=mongodb://mongo:27017/docsgpt
|
||||
- API_URL=http://backend:5001
|
||||
- API_URL=http://backend:7091
|
||||
depends_on:
|
||||
- redis
|
||||
- mongo
|
||||
- redis
|
||||
- mongo
|
||||
|
||||
redis:
|
||||
image: redis:6-alpine
|
||||
@@ -52,8 +56,5 @@ services:
|
||||
volumes:
|
||||
- mongodb_data_container:/data/db
|
||||
|
||||
|
||||
|
||||
volumes:
|
||||
mongodb_data_container:
|
||||
app_data_container:
|
||||
1
docs/README.md
Normal file
@@ -0,0 +1 @@
|
||||
# nextra-docsgpt
|
||||
9
docs/next.config.js
Normal file
@@ -0,0 +1,9 @@
|
||||
// Load the Nextra plugin factory, then configure it with the docs theme and
// the path to the theme configuration file.
const nextra = require('nextra')

const withNextra = nextra({
  theme: 'nextra-theme-docs',
  themeConfig: './theme.config.jsx',
})

// Export the Next.js configuration produced by the configured plugin.
module.exports = withNextra()
|
||||
|
||||
// If you have other Next.js configurations, you can pass them as the parameter:
|
||||
// module.exports = withNextra({ /* other next.js config */ })
|
||||
5975
docs/package-lock.json
generated
Normal file
11
docs/package.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"@vercel/analytics": "^1.0.2",
|
||||
"docsgpt": "^0.2.4",
|
||||
"next": "^13.4.19",
|
||||
"nextra": "^2.12.3",
|
||||
"nextra-theme-docs": "^2.12.3",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0"
|
||||
}
|
||||
}
|
||||
112
docs/pages/Deploying/Hosting-the-app.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Self-hosting DocsGPT on Amazon Lightsail
|
||||
|
||||
Here's a step-by-step guide on how to set up an Amazon Lightsail instance to host DocsGPT.
|
||||
|
||||
## Configuring your instance
|
||||
|
||||
(If you know how to create a Lightsail instance, you can skip to the recommended configuration part by clicking here)
|
||||
|
||||
### 1. Create an account or login to https://lightsail.aws.amazon.com
|
||||
|
||||
### 2. Click on "Create instance"
|
||||
|
||||
### 3. Create your instance
|
||||
|
||||
The first step is to select the "Instance location". In most cases there's no need to switch locations as the default one will work well.
|
||||
|
||||
After that it is time to pick your Instance Image. We recommend using "Linux/Unix" as the image and "Ubuntu 20.04 LTS" for Operating System.
|
||||
|
||||
As for instance plan, it'll vary depending on your unique demands, but a "1 GB, 1vCPU, 40GB SSD and 2TB transfer" setup should cover most scenarios.
|
||||
|
||||
Lastly, identify your instance by giving it a unique name and then hit "Create instance".
|
||||
|
||||
PS: Once you create your instance, it'll likely take a few minutes for the setup to be completed.
|
||||
|
||||
#### The recommended configuration is as follows:
|
||||
|
||||
- Ubuntu 20.04 LTS
|
||||
- 1GB RAM
|
||||
- 1vCPU
|
||||
- 40GB SSD Hard Drive
|
||||
- 2TB transfer
|
||||
|
||||
### Connecting to your newly created instance
|
||||
|
||||
Your instance will be ready for use a few minutes after being created. To access, just open it up and click on "Connect using SSH".
|
||||
|
||||
#### Clone the repository
|
||||
|
||||
A terminal window will pop up, and the first step will be to clone DocsGPT git repository.
|
||||
|
||||
`git clone https://github.com/arc53/DocsGPT.git`
|
||||
|
||||
#### Download the package information
|
||||
|
||||
Once it has finished cloning the repository, it is time to download the package information from all sources. To do so simply enter the following command:
|
||||
|
||||
`sudo apt update`
|
||||
|
||||
#### Install Docker and Docker Compose
|
||||
|
||||
The DocsGPT backend and worker use Python, the frontend is written in React, and the whole application is containerized using Docker. To install Docker and Docker Compose, enter the following commands:
|
||||
|
||||
`sudo apt install docker.io`
|
||||
|
||||
And now install docker-compose:
|
||||
|
||||
`sudo apt install docker-compose`
|
||||
|
||||
#### Access the DocsGPT folder
|
||||
|
||||
Enter the following command to access the folder in which DocsGPT docker-compose file is.
|
||||
|
||||
`cd DocsGPT/`
|
||||
|
||||
#### Prepare the environment
|
||||
|
||||
Inside the DocsGPT folder create a .env file and copy the contents of .env_sample into it.
|
||||
|
||||
`nano .env`
|
||||
|
||||
Make sure your .env file looks like this:
|
||||
|
||||
```
|
||||
OPENAI_API_KEY=(Your OpenAI API key)
|
||||
VITE_API_STREAMING=true
|
||||
SELF_HOSTED_MODEL=false
|
||||
```
|
||||
|
||||
To save the file, press CTRL+X, then Y and then ENTER.
|
||||
|
||||
Next we need to set a correct IP for our Backend. To do so, open the docker-compose.yml file:
|
||||
|
||||
`nano docker-compose.yml`
|
||||
|
||||
And change this line 7 `VITE_API_HOST=http://localhost:7091`
|
||||
to this `VITE_API_HOST=http://<your instance public IP>:7091`
|
||||
|
||||
This will allow the frontend to connect to the backend.
|
||||
|
||||
#### Running the app
|
||||
|
||||
You're almost there! Now that all the necessary bits and pieces have been installed, it is time to run the application. To do so, use the following command:
|
||||
|
||||
`sudo docker-compose up -d`
|
||||
|
||||
If you launch it for the first time it will take a few minutes to download all the necessary dependencies and build.
|
||||
|
||||
Once this is done you can go ahead and close the terminal window.
|
||||
|
||||
#### Enabling ports
|
||||
|
||||
Before you can access your live instance, you must first enable the ports it uses.
|
||||
|
||||
Open your Lightsail instance and head to "Networking".
|
||||
|
||||
Then click on "Add rule" under "IPv4 Firewall", enter 5173 as your port and hit "Create".
|
||||
Repeat the process for port 7091.
|
||||
|
||||
#### Access your instance
|
||||
|
||||
Your instance will now be available under your Public IP Address and port 5173. Enjoy!
|
||||
|
||||
23
docs/pages/Deploying/Quickstart.md
Normal file
@@ -0,0 +1,23 @@
|
||||
## Launching Web App
|
||||
Note: Make sure you have docker installed
|
||||
|
||||
1. Download this repository with `git clone https://github.com/arc53/DocsGPT.git`
|
||||
2. Create a .env file in your root directory and set `OPENAI_API_KEY` to your OpenAI API key
|
||||
3. Run `docker-compose build && docker-compose up`
|
||||
4. Navigate to `http://localhost:5173/`
|
||||
|
||||
To stop just run Ctrl + C
|
||||
|
||||
### Chrome Extension
|
||||
|
||||
To install the Chrome extension:
|
||||
|
||||
1. In the DocsGPT GitHub repository, click on the "Code" button and select Download ZIP
|
||||
2. Unzip the downloaded file to a location you can easily access
|
||||
3. Open the Google Chrome browser and click on the three dots menu (upper right corner)
|
||||
4. Select "More Tools" and then "Extensions"
|
||||
5. Turn on the "Developer mode" switch in the top right corner of the Extensions page
|
||||
6. Click on the "Load unpacked" button
|
||||
7. Select the "Chrome" folder where the DocsGPT files have been unzipped (docsgpt-main > extensions > chrome)
|
||||
8. The extension should now be added to Google Chrome and can be managed on the Extensions page
|
||||
9. To disable or remove the extension, simply turn off the toggle switch on the extension card or click the "Remove" button.
|
||||
10
docs/pages/Deploying/_meta.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"Hosting-the-app": {
|
||||
"title": "☁️ Hosting DocsGPT",
|
||||
"href": "/Deploying/Hosting-the-app"
|
||||
},
|
||||
"Quickstart": {
|
||||
"title": "⚡️Quickstart",
|
||||
"href": "/Deploying/Quickstart"
|
||||
}
|
||||
}
|
||||
153
docs/pages/Developing/API-docs.md
Normal file
@@ -0,0 +1,153 @@
|
||||
App currently has two main api endpoints:
|
||||
|
||||
### /api/answer
|
||||
It's a POST request that sends JSON in the body with 4 values. Here is a JavaScript fetch example.
|
||||
It will receive an answer for a user provided question
|
||||
|
||||
```js
|
||||
// answer (POST http://127.0.0.1:5000/api/answer)
|
||||
fetch("http://127.0.0.1:5000/api/answer", {
|
||||
"method": "POST",
|
||||
"headers": {
|
||||
"Content-Type": "application/json; charset=utf-8"
|
||||
},
|
||||
"body": JSON.stringify({"question":"Hi","history":null,"api_key":"OPENAI_API_KEY","embeddings_key":"OPENAI_API_KEY",
|
||||
"active_docs": "javascript/.project/ES2015/openai_text-embedding-ada-002/"})
|
||||
})
|
||||
.then((res) => res.text())
|
||||
.then(console.log.bind(console))
|
||||
```
|
||||
|
||||
In response you will get a json document like this one:
|
||||
|
||||
```json
|
||||
{
|
||||
"answer": " Hi there! How can I help you?\n",
|
||||
"query": "Hi",
|
||||
"result": " Hi there! How can I help you?\nSOURCES:"
|
||||
}
|
||||
```
|
||||
|
||||
### /api/docs_check
|
||||
It will make sure documentation is loaded on a server (just run it every time the user switches between libraries (documentations)).
|
||||
It's a POST request that sends JSON in the body with 1 value. Here is a JavaScript fetch example:
|
||||
|
||||
```js
|
||||
// answer (POST http://127.0.0.1:5000/api/docs_check)
|
||||
fetch("http://127.0.0.1:5000/api/docs_check", {
|
||||
"method": "POST",
|
||||
"headers": {
|
||||
"Content-Type": "application/json; charset=utf-8"
|
||||
},
|
||||
"body": JSON.stringify({"docs":"javascript/.project/ES2015/openai_text-embedding-ada-002/"})
|
||||
})
|
||||
.then((res) => res.text())
|
||||
.then(console.log.bind(console))
|
||||
```
|
||||
|
||||
In response you will get a json document like this one:
|
||||
```json
|
||||
{
|
||||
"status": "exists"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### /api/combine
|
||||
Provides json that tells UI which vectors are available and where they are located with a simple get request
|
||||
|
||||
Response will include:
|
||||
date, description, docLink, fullName, language, location (local or docshub), model, name, version
|
||||
|
||||
Example of json in Docshub and local:
|
||||
<img width="295" alt="image" src="https://user-images.githubusercontent.com/15183589/224714085-f09f51a4-7a9a-4efb-bd39-798029bb4273.png">
|
||||
|
||||
|
||||
### /api/upload
|
||||
Uploads a file that needs to be trained. The response is JSON with a task id, which can be used to check on the task's progress
|
||||
HTML example:
|
||||
|
||||
```html
|
||||
<form action="/api/upload" method="post" enctype="multipart/form-data" class="mt-2">
|
||||
<input type="file" name="file" class="py-4" id="file-upload">
|
||||
<input type="text" name="user" value="local" hidden>
|
||||
<input type="text" name="name" placeholder="Name:">
|
||||
|
||||
|
||||
<button type="submit" class="py-2 px-4 text-white bg-blue-500 rounded-md hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500">
|
||||
Upload
|
||||
</button>
|
||||
</form>
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"status": "ok",
|
||||
"task_id": "b2684988-9047-428b-bd47-08518679103c"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### /api/task_status
|
||||
Gets task status (task_id) from /api/upload
|
||||
```js
|
||||
// Task status (Get http://127.0.0.1:5000/api/task_status)
|
||||
fetch("http://localhost:5001/api/task_status?task_id=b2d2a0f4-387c-44fd-a443-e4fe2e7454d1", {
|
||||
"method": "GET",
|
||||
"headers": {
|
||||
"Content-Type": "application/json; charset=utf-8"
|
||||
},
|
||||
})
|
||||
.then((res) => res.text())
|
||||
.then(console.log.bind(console))
|
||||
```
|
||||
|
||||
Responses:
|
||||
There are two types of responses:
|
||||
1. While the task is still running, where "current" will show progress from 0 - 100
|
||||
```json
|
||||
{
|
||||
"result": {
|
||||
"current": 1
|
||||
},
|
||||
"status": "PROGRESS"
|
||||
}
|
||||
```
|
||||
|
||||
2. When task is completed
|
||||
```json
|
||||
{
|
||||
"result": {
|
||||
"directory": "temp",
|
||||
"filename": "install.rst",
|
||||
"formats": [
|
||||
".rst",
|
||||
".md",
|
||||
".pdf"
|
||||
],
|
||||
"name_job": "somename",
|
||||
"user": "local"
|
||||
},
|
||||
"status": "SUCCESS"
|
||||
}
|
||||
```
|
||||
|
||||
### /api/delete_old
|
||||
Deletes old vectorstores
|
||||
```js
|
||||
// Task status (GET http://127.0.0.1:5000/api/docs_check)
|
||||
fetch("http://localhost:5001/api/task_status?task_id=b2d2a0f4-387c-44fd-a443-e4fe2e7454d1", {
|
||||
"method": "GET",
|
||||
"headers": {
|
||||
"Content-Type": "application/json; charset=utf-8"
|
||||
},
|
||||
})
|
||||
.then((res) => res.text())
|
||||
.then(console.log.bind(console))
|
||||
```
|
||||
response:
|
||||
|
||||
```json
|
||||
{ "status": "ok" }
|
||||
```
|
||||
6
docs/pages/Developing/_meta.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"API-docs": {
|
||||
"title": "🗂️️ API-docs",
|
||||
"href": "/Developing/API-docs"
|
||||
}
|
||||
}
|
||||
29
docs/pages/Extensions/Chatwoot-extension.md
Normal file
@@ -0,0 +1,29 @@
|
||||
### To start chatwoot extension:
|
||||
1. Prepare and start the DocsGPT itself (load your documentation too)
|
||||
Follow our [wiki](https://github.com/arc53/DocsGPT/wiki) to start it and to [ingest](https://github.com/arc53/DocsGPT/wiki/How-to-train-on-other-documentation) data
|
||||
2. Go to chatwoot, Navigate to your profile (bottom left), click on profile settings, scroll to the bottom and copy Access Token
|
||||
2. Navigate to `/extensions/chatwoot`. Copy .env_sample and create .env file
|
||||
3. Fill in the values
|
||||
|
||||
```
|
||||
docsgpt_url=<docsgpt_api_url>
|
||||
chatwoot_url=<chatwoot_url>
|
||||
docsgpt_key=<openai_api_key or other llm key>
|
||||
chatwoot_token=<from part 2>
|
||||
```
|
||||
|
||||
4. start with `flask run` command
|
||||
|
||||
If you want the bot to stop responding to questions for a specific user or session, just add the label `human-requested` to your conversation
|
||||
|
||||
|
||||
### Optional (extra validation)
|
||||
In app.py uncomment lines 12-13 and 71-75
|
||||
|
||||
in your .env file add:
|
||||
|
||||
`account_id=(optional) 1 `
|
||||
|
||||
`assignee_id=(optional) 1`
|
||||
|
||||
Those are Chatwoot values; they allow you to check that you are responding to the correct widget and to questions assigned to a specific user
|
||||
10
docs/pages/Extensions/_meta.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"Chatwoot-extension": {
|
||||
"title": "💬️ Chatwoot Extension",
|
||||
"href": "/Extensions/Chatwoot-extension"
|
||||
},
|
||||
"react-widget": {
|
||||
"title": "🏗️ Widget setup",
|
||||
"href": "/Extensions/react-widget"
|
||||
}
|
||||
}
|
||||
37
docs/pages/Extensions/react-widget.md
Normal file
@@ -0,0 +1,37 @@
|
||||
### How to set up react docsGPT widget on your website:
|
||||
|
||||
### Installation
|
||||
Go to your project and install a new dependency: `npm install docsgpt`
|
||||
|
||||
### Usage
|
||||
Go to your project and in the file where you want to use the widget import it:
|
||||
```js
|
||||
import { DocsGPTWidget } from "docsgpt";
|
||||
import "docsgpt/dist/style.css";
|
||||
```
|
||||
|
||||
|
||||
Then you can use it like this: `<DocsGPTWidget />`
|
||||
|
||||
DocsGPTWidget takes 3 props:
|
||||
- `apiHost` - url of your DocsGPT API
|
||||
- `selectDocs` - documentation that you want to use for your widget (eg. `default` or `local/docs1.zip`)
|
||||
- `apiKey` - usually it's empty
|
||||
|
||||
### How to use DocsGPTWidget with [Nextra](https://nextra.site/) (Next.js + MDX)
|
||||
Install your widget as described above and then go to your `pages/` folder and create a new file `_app.js` with the following content:
|
||||
```js
|
||||
import { DocsGPTWidget } from "docsgpt";
|
||||
import "docsgpt/dist/style.css";
|
||||
|
||||
export default function MyApp({ Component, pageProps }) {
|
||||
return (
|
||||
<>
|
||||
<Component {...pageProps} />
|
||||
<DocsGPTWidget selectDocs="local/docsgpt-sep.zip/"/>
|
||||
</>
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
4
docs/pages/Guides/Customising-prompts.md
Normal file
@@ -0,0 +1,4 @@
|
||||
## To customise the main prompt navigate to `/application/prompts/combine_prompt.txt`
|
||||
|
||||
You can try editing it to see how the model responds.
|
||||
|
||||
60
docs/pages/Guides/How-to-train-on-other-documentation.md
Normal file
@@ -0,0 +1,60 @@
|
||||
## How to train on other documentation
|
||||
This AI can use any documentation, but first it needs to be prepared for similarity search.
|
||||
|
||||

|
||||
|
||||
Start by going to
|
||||
`/scripts/` folder
|
||||
|
||||
If you open the `ingest.py` file in this folder you will see that it uses RST files from the folder to create `index.faiss` and `index.pkl`.
|
||||
|
||||
It currently uses OPEN_AI to create vector store, so make sure your documentation is not too big. Pandas cost me around 3-4$
|
||||
|
||||
You can usually find documentation on github in docs/ folder for most open-source projects.
|
||||
|
||||
### 1. Find documentation in .rst/.md and create a folder with it in your scripts directory
|
||||
Name it `inputs/`
|
||||
Put all your .rst/.md files in there
|
||||
The search is recursive, so you don't need to flatten them
|
||||
|
||||
If there are no .rst/.md files just convert whatever you find to txt and feed it. (don't forget to change the extension in script)
|
||||
|
||||
### 2. Create .env file in `scripts/` folder
|
||||
And write your OpenAI API key inside
|
||||
`OPENAI_API_KEY=<your-api-key>`
|
||||
|
||||
### 3. Run scripts/ingest.py
|
||||
|
||||
`python ingest.py ingest`
|
||||
|
||||
It will tell you how much it will cost
|
||||
|
||||
### 4. Move `index.faiss` and `index.pkl` generated in `scripts/output` to `application/` folder.
|
||||
|
||||
|
||||
### 5. Run web app
|
||||
Once you run it, it will use the new context that is relevant to your documentation
|
||||
Make sure you select default in the dropdown in the UI
|
||||
|
||||
## Customisation
|
||||
You can learn more about options while running ingest.py by running:
|
||||
|
||||
`python ingest.py --help`
|
||||
| Options | |
|
||||
|:--------------------------------:|:------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| **ingest** | Runs 'ingest' function converting documentation to Faiss plus Index format |
|
||||
| --dir TEXT | List of paths to directory for index creation. E.g. --dir inputs --dir inputs2 [default: inputs] |
|
||||
| --file TEXT | File paths to use (Optional; overrides directory) E.g. --file inputs/1.md --file inputs/2.md |
|
||||
| --recursive / --no-recursive | Whether to recursively search in subdirectories [default: recursive] |
|
||||
| --limit INTEGER | Maximum number of files to read |
|
||||
| --formats TEXT | List of required extensions (list with .) Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html [default: .rst, .md] |
|
||||
| --exclude / --no-exclude | Whether to exclude hidden files (dotfiles) [default: exclude] |
|
||||
| -y, --yes | Whether to skip price confirmation |
|
||||
| --sample / --no-sample | Whether to output sample of the first 5 split documents. [default: no-sample] |
|
||||
| --token-check / --no-token-check | Whether to group small documents and split large. Improves semantics. [default: token-check] |
|
||||
| --min_tokens INTEGER | Minimum number of tokens to not group. [default: 150] |
|
||||
| --max_tokens INTEGER | Maximum number of tokens to not split. [default: 2000] |
|
||||
| | |
|
||||
| **convert** | Creates documentation in .md format from source code |
|
||||
| --dir TEXT | Path to a directory with source code. E.g. --dir inputs [default: inputs] |
|
||||
| --formats TEXT | Source code language from which to create documentation. Supports py, js and java. E.g. --formats py [default: py] |
|
||||
32
docs/pages/Guides/How-to-use-different-LLM.md
Normal file
@@ -0,0 +1,32 @@
|
||||
Fortunately, there are many providers for LLMs, and some of them can even be run locally
|
||||
|
||||
There are two models used in the app:
|
||||
1. Embeddings
|
||||
2. Text generation
|
||||
|
||||
By default we use OpenAI's models, but if you want to change it or even run it locally, it's very simple!
|
||||
|
||||
### Go to .env file or set environment variables:
|
||||
|
||||
`LLM_NAME=<your Text generation>`
|
||||
|
||||
`API_KEY=<api_key for Text generation>`
|
||||
|
||||
`EMBEDDINGS_NAME=<llm for embeddings>`
|
||||
|
||||
`EMBEDDINGS_KEY=<api_key for embeddings>`
|
||||
|
||||
`VITE_API_STREAMING=<true or false (true if using openai, false for all others)>`
|
||||
|
||||
You don't need to provide keys if you are happy with users providing theirs, so make sure you set LLM_NAME and EMBEDDINGS_NAME
|
||||
|
||||
Options:
|
||||
LLM_NAME (openai, manifest, cohere, Arc53/docsgpt-14b, Arc53/docsgpt-7b-falcon)
|
||||
EMBEDDINGS_NAME (openai_text-embedding-ada-002, huggingface_sentence-transformers/all-mpnet-base-v2, huggingface_hkunlp/instructor-large, cohere_medium)
|
||||
|
||||
That's it!
|
||||
|
||||
### Hosting everything locally and privately (for using our optimised open-source models)
|
||||
This is useful if you are working with important data and don't want anything to leave your premises.
|
||||
|
||||
Make sure you set SELF_HOSTED_MODEL as true in your .env file and for your LLM_NAME you can use anything that's on Huggingface
|
||||
@@ -0,0 +1,19 @@
|
||||
If your AI uses external knowledge and is not explicit enough it is ok, because we try to make docsgpt friendly.
|
||||
|
||||
But if you want to adjust it, here is a simple way.
|
||||
|
||||
Go to `application/prompts/chat_combine_prompt.txt`
|
||||
|
||||
And change it to
|
||||
|
||||
|
||||
```
|
||||
|
||||
You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples, if possible.
|
||||
Write an answer for the question below based on the provided context.
|
||||
If the context provides insufficient information, reply "I cannot answer".
|
||||
You have access to chat history and can use it to help answer the question.
|
||||
----------------
|
||||
{summaries}
|
||||
|
||||
```
|
||||
18
docs/pages/Guides/_meta.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"Customising-prompts": {
|
||||
"title": "🏗️️ Customising Prompts",
|
||||
"href": "/Guides/Customising-prompts"
|
||||
},
|
||||
"How-to-train-on-other-documentation": {
|
||||
"title": "📥 Training on docs",
|
||||
"href": "/Guides/How-to-train-on-other-documentation"
|
||||
},
|
||||
"How-to-use-different-LLM": {
|
||||
"title": "⚙️️ How to use different LLM's",
|
||||
"href": "/Guides/How-to-use-different-LLM"
|
||||
},
|
||||
"My-AI-answers-questions-using-external-knowledge": {
|
||||
"title": "💭️ Avoiding hallucinations",
|
||||
"href": "/Guides/My-AI-answers-questions-using-external-knowledge"
|
||||
}
|
||||
}
|
||||
11
docs/pages/_app.js
Normal file
@@ -0,0 +1,11 @@
|
||||
import { DocsGPTWidget } from "docsgpt";
|
||||
import "docsgpt/dist/style.css";
|
||||
|
||||
export default function MyApp({ Component, pageProps }) {
|
||||
return (
|
||||
<>
|
||||
<Component {...pageProps} />
|
||||
<DocsGPTWidget selectDocs="local/docsgpt-sep.zip/"/>
|
||||
</>
|
||||
)
|
||||
}
|
||||
37
docs/pages/index.mdx
Normal file
@@ -0,0 +1,37 @@
|
||||
---
|
||||
title: 'Home'
|
||||
---
|
||||
import { Cards, Card } from 'nextra/components'
|
||||
import deployingGuides from './Deploying/_meta.json';
|
||||
import developingGuides from './Developing/_meta.json';
|
||||
import extensionGuides from './Extensions/_meta.json';
|
||||
import mainGuides from './Guides/_meta.json';
|
||||
|
||||
|
||||
|
||||
|
||||
export const allGuides = {
|
||||
...mainGuides,
|
||||
...developingGuides,
|
||||
...deployingGuides,
|
||||
...extensionGuides,
|
||||
};
|
||||
|
||||
### **DocsGPT 🦖**
|
||||
|
||||
DocsGPT 🦖 is an innovative open-source tool designed to simplify the retrieval of information from project documentation using advanced GPT models 🤖. Eliminate lengthy manual searches 🔍 and enhance your documentation experience with DocsGPT, and consider contributing to its AI-powered future 🚀.
|
||||
|
||||
Our demo: [https://docsgpt.arc53.com/](https://docsgpt.arc53.com/)
|
||||
|
||||
Want to earn a cool shirt by submitting a **meaningful** PR? Check out the [Hacktoberfest](https://github.com/arc53/DocsGPT/blob/main/HACKTOBERFEST.md) guide.
|
||||
|
||||
<Cards
|
||||
num={3}
|
||||
children={Object.keys(allGuides).map((key, i) => (
|
||||
<Card
|
||||
key={i}
|
||||
title={allGuides[key].title}
|
||||
href={allGuides[key].href}
|
||||
/>
|
||||
))}
|
||||
/>
|
||||
BIN
docs/public/cute-docsgpt.png
Normal file
|
After Width: | Height: | Size: 191 KiB |
BIN
docs/public/favicons/apple-touch-icon.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
docs/public/favicons/favicon-16x16.png
Normal file
|
After Width: | Height: | Size: 1.1 KiB |