From 59328ea44d4063c764c8710ef97fc63573278aca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:48:56 +0000 Subject: [PATCH 01/13] chore(deps): bump pandas from 2.2.0 to 2.2.2 in /application Bumps [pandas](https://github.com/pandas-dev/pandas) from 2.2.0 to 2.2.2. - [Release notes](https://github.com/pandas-dev/pandas/releases) - [Commits](https://github.com/pandas-dev/pandas/compare/v2.2.0...v2.2.2) --- updated-dependencies: - dependency-name: pandas dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 21a4b883..01033549 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -17,7 +17,7 @@ javalang==0.13.0 langchain==0.1.4 langchain-openai==0.0.5 openapi3_parser==1.1.16 -pandas==2.2.0 +pandas==2.2.2 pydantic_settings==2.4.0 pymongo==4.8.0 PyPDF2==3.0.1 From 3886e41e941ce8cdd41fe62c34183c1c3d71a6b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:04:55 +0000 Subject: [PATCH 02/13] chore(deps-dev): bump braces from 3.0.2 to 3.0.3 in /extensions/chrome Bumps [braces](https://github.com/micromatch/braces) from 3.0.2 to 3.0.3. - [Changelog](https://github.com/micromatch/braces/blob/master/CHANGELOG.md) - [Commits](https://github.com/micromatch/braces/compare/3.0.2...3.0.3) --- updated-dependencies: - dependency-name: braces dependency-type: indirect ... Signed-off-by: dependabot[bot] --- extensions/chrome/package-lock.json | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/extensions/chrome/package-lock.json b/extensions/chrome/package-lock.json index 4f8145d6..9b16bfc4 100644 --- a/extensions/chrome/package-lock.json +++ b/extensions/chrome/package-lock.json @@ -107,12 +107,12 @@ } }, "node_modules/braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dev": true, "dependencies": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" }, "engines": { "node": ">=8" @@ -260,9 +260,9 @@ } }, "node_modules/fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dev": true, "dependencies": { "to-regex-range": "^5.0.1" @@ -884,12 +884,12 @@ "dev": true }, "braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dev": true, "requires": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" } }, "camelcase-css": { @@ -1000,9 +1000,9 @@ } }, "fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dev": true, "requires": { "to-regex-range": "^5.0.1" From 80df49478713d6df0b42d5441e6e5df712335e16 Mon Sep 17 00:00:00 2001 From: Aamer Aryan Date: Sat, 31 Aug 2024 04:37:56 +0530 Subject: [PATCH 03/13] fix: add .csv support to file upload input --- frontend/src/locale/en.json | 2 +- frontend/src/upload/Upload.tsx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index d064c6fb..773768bd 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -71,7 +71,7 @@ "remote": "Remote", "name": "Name", "choose": "Choose Files", - "info": "Please upload .pdf, .txt, .rst, .docx, .md, .zip limited to 25mb", + "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb", "uploadedFiles": "Uploaded Files", "cancel": "Cancel", "train": "Train", diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 5d5d1ac5..c5eed6d8 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -259,6 +259,7 @@ function Upload({ 'application/zip': ['.zip'], 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'text/csv': ['.csv'], }, }); From 5c9e6404cc0ff8cd7253777ec8a5761122f0ef42 Mon Sep 17 00:00:00 2001 From: Aamer Aryan Date: Sat, 31 Aug 2024 04:45:25 +0530 Subject: [PATCH 04/13] fix: bump eslint-plugin-prettier from 4.2.1 to 5.2.1 Committer: Aamer Aryan --- frontend/package-lock.json | 65 +++++++++++++++---- frontend/package.json | 2 +- frontend/src/Navigation.tsx | 16 ++--- frontend/src/components/Dropdown.tsx | 32 ++++----- frontend/src/conversation/Conversation.tsx | 4 +- .../src/conversation/ConversationBubble.tsx | 10 +-- frontend/src/modals/DeleteConvModal.tsx | 14 ++-- frontend/src/preferences/APIKeyModal.tsx | 14 ++-- 8 files changed, 95 insertions(+), 62 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 92a2c132..0f7675fa 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -37,7 +37,7 @@ "eslint-config-standard-with-typescript": "^34.0.0", "eslint-plugin-import": "^2.27.5", "eslint-plugin-n": "^15.7.0", - "eslint-plugin-prettier": "^4.2.1", + "eslint-plugin-prettier": "^5.2.1", "eslint-plugin-promise": "^6.6.0", "eslint-plugin-react": "^7.35.0", "eslint-plugin-unused-imports": "^2.0.0", @@ -893,6 +893,18 @@ "node": ">= 8" } }, + "node_modules/@pkgr/core": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.1.1.tgz", + "integrity": "sha512-cq8o4cWH0ibXh9VGi5P20Tu9XF/0fFXl9EUinr9QfTM7a7p0oTA4iJRCQWppXR1Pg8dSM0UCItCkPwsk9qWWYA==", + "dev": true, + "engines": { + "node": "^12.20.0 || ^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/unts" + } + }, "node_modules/@reduxjs/toolkit": { "version": "1.9.2", "resolved": "https://registry.npmjs.org/@reduxjs/toolkit/-/toolkit-1.9.2.tgz", @@ -3469,21 +3481,30 @@ "dev": true }, "node_modules/eslint-plugin-prettier": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-4.2.1.tgz", - "integrity": "sha512-f/0rXLXUt0oFYs8ra4w49wYZBG5GKZpAYsJSm6rnYL5uVDjd+zowwMwVZHnAjf4edNrKpCDYfXDgmRE/Ak7QyQ==", + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-5.2.1.tgz", + "integrity": "sha512-gH3iR3g4JfF+yYPaJYkN7jEl9QbweL/YfkoRlNnuIEHEz1vHVlCmWOS+eGGiRuzHQXdJFCOTxRgvju9b8VUmrw==", "dev": true, "dependencies": { - "prettier-linter-helpers": "^1.0.0" + "prettier-linter-helpers": "^1.0.0", + "synckit": "^0.9.1" }, "engines": { - "node": ">=12.0.0" + "node": "^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint-plugin-prettier" }, "peerDependencies": { - "eslint": ">=7.28.0", - "prettier": ">=2.0.0" + "@types/eslint": ">=8.0.0", + "eslint": ">=8.0.0", + "eslint-config-prettier": "*", + "prettier": ">=3.0.0" }, "peerDependenciesMeta": { + "@types/eslint": { + "optional": true + }, "eslint-config-prettier": { "optional": true } @@ -6434,9 +6455,9 @@ ] }, "node_modules/micromatch": { - "version": "4.0.7", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.7.tgz", - "integrity": "sha512-LPP/3KorzCwBxfeUuZmaR6bG2kdeHSbe0P2tY3FLRU4vYrjYz5hI4QZwV0njUx3jeuKe67YukQ1LSPZBKDqO/Q==", + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", "dev": true, "dependencies": { "braces": "^3.0.3", @@ -8214,6 +8235,28 @@ "integrity": "sha512-e4hG1hRwoOdRb37cIMSgzNsxyzKfayW6VOflrwvR+/bzrkyxY/31WkbgnQpgtrNp1SdpJvpUAGTa/ZoiPNDuRQ==", "dev": true }, + "node_modules/synckit": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.9.1.tgz", + "integrity": "sha512-7gr8p9TQP6RAHusBOSLs46F4564ZrjV8xFmw5zCmgmhGUcw2hxsShhJ6CEiHQMgPDwAQ1fWHPM0ypc4RMAig4A==", + "dev": true, + "dependencies": { + "@pkgr/core": "^0.1.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/unts" + } + }, + "node_modules/synckit/node_modules/tslib": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz", + "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==", + "dev": true + }, "node_modules/tailwindcss": { "version": "3.2.4", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.2.4.tgz", diff --git a/frontend/package.json b/frontend/package.json index bb8d2974..e45fbd36 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -48,7 +48,7 @@ "eslint-config-standard-with-typescript": "^34.0.0", "eslint-plugin-import": "^2.27.5", "eslint-plugin-n": "^15.7.0", - "eslint-plugin-prettier": "^4.2.1", + "eslint-plugin-prettier": "^5.2.1", "eslint-plugin-promise": "^6.6.0", "eslint-plugin-react": "^7.35.0", "eslint-plugin-unused-imports": "^2.0.0", diff --git a/frontend/src/Navigation.tsx b/frontend/src/Navigation.tsx index 56342ead..6514ba41 100644 --- a/frontend/src/Navigation.tsx +++ b/frontend/src/Navigation.tsx @@ -174,16 +174,12 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { console.error(err); }); } - useOutsideAlerter( - navRef, - () => { - if (isMobile && navOpen && apiKeyModalState === 'INACTIVE') { - setNavOpen(false); - setIsDocsListOpen(false); - } - }, - [navOpen, isDocsListOpen, apiKeyModalState], - ); + useOutsideAlerter(navRef, () => { + if (isMobile && navOpen && apiKeyModalState === 'INACTIVE') { + setNavOpen(false); + setIsDocsListOpen(false); + } + }, [navOpen, isDocsListOpen, apiKeyModalState]); /* Needed to fix bug where if mobile nav was closed and then window was resized to desktop, nav would still be closed but the button to open would be gone, as per #1 on issue #146 diff --git a/frontend/src/components/Dropdown.tsx b/frontend/src/components/Dropdown.tsx index 17516aaa..adf17889 100644 --- a/frontend/src/components/Dropdown.tsx +++ b/frontend/src/components/Dropdown.tsx @@ -91,14 +91,14 @@ function Dropdown({ {selectedValue && 'label' in selectedValue ? selectedValue.label : selectedValue && 'description' in selectedValue - ? `${ - selectedValue.value < 1e9 - ? selectedValue.value + ` (${selectedValue.description})` - : selectedValue.description - }` - : placeholder - ? placeholder - : 'From URL'} + ? `${ + selectedValue.value < 1e9 + ? selectedValue.value + ` (${selectedValue.description})` + : selectedValue.description + }` + : placeholder + ? placeholder + : 'From URL'} )} {showEdit && onEdit && ( { + useEffect(() => { fetchStream.current && fetchStream.current.abort(); - },[conversationId]); + }, [conversationId]); useEffect(() => { const observerCallback: IntersectionObserverCallback = (entries) => { diff --git a/frontend/src/conversation/ConversationBubble.tsx b/frontend/src/conversation/ConversationBubble.tsx index a21e352d..7ec98820 100644 --- a/frontend/src/conversation/ConversationBubble.tsx +++ b/frontend/src/conversation/ConversationBubble.tsx @@ -235,10 +235,11 @@ const ConversationBubble = forwardRef< + > + {String(children).replace(/\n$/, '')} +
{ setIsSidebarOpen(state); }} - children={} - /> + > + + )}
); diff --git a/frontend/src/modals/DeleteConvModal.tsx b/frontend/src/modals/DeleteConvModal.tsx index c2fc5e38..43ec2a65 100644 --- a/frontend/src/modals/DeleteConvModal.tsx +++ b/frontend/src/modals/DeleteConvModal.tsx @@ -19,15 +19,11 @@ export default function DeleteConvModal({ const dispatch = useDispatch(); const { isMobile } = useMediaQuery(); const { t } = useTranslation(); - useOutsideAlerter( - modalRef, - () => { - if (isMobile && modalState === 'ACTIVE') { - dispatch(setModalState('INACTIVE')); - } - }, - [modalState], - ); + useOutsideAlerter(modalRef, () => { + if (isMobile && modalState === 'ACTIVE') { + dispatch(setModalState('INACTIVE')); + } + }, [modalState]); function handleSubmit() { handleDeleteAllConv(); diff --git a/frontend/src/preferences/APIKeyModal.tsx b/frontend/src/preferences/APIKeyModal.tsx index 166bd62b..43698fe1 100644 --- a/frontend/src/preferences/APIKeyModal.tsx +++ b/frontend/src/preferences/APIKeyModal.tsx @@ -22,15 +22,11 @@ export default function APIKeyModal({ const modalRef = useRef(null); const { isMobile } = useMediaQuery(); - useOutsideAlerter( - modalRef, - () => { - if (isMobile && modalState === 'ACTIVE') { - setModalState('INACTIVE'); - } - }, - [modalState], - ); + useOutsideAlerter(modalRef, () => { + if (isMobile && modalState === 'ACTIVE') { + setModalState('INACTIVE'); + } + }, [modalState]); function handleSubmit() { if (key.length <= 1) { From 4f88b6dc71148a51928ca9975ada75fc315ac900 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 31 Aug 2024 12:30:03 +0100 Subject: [PATCH 05/13] feat: logging --- application/api/answer/routes.py | 25 +++++++++++++++++++------ application/app.py | 2 ++ application/celery_init.py | 6 ++++++ application/core/logging_config.py | 22 ++++++++++++++++++++++ application/worker.py | 12 ++++++------ 5 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 application/core/logging_config.py diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py index 893edd3a..a809b4ef 100644 --- a/application/api/answer/routes.py +++ b/application/api/answer/routes.py @@ -1,7 +1,7 @@ import asyncio import os import sys -from flask import Blueprint, request, Response +from flask import Blueprint, request, Response, current_app import json import datetime import logging @@ -267,6 +267,10 @@ def stream(): else: retriever_name = source["active_docs"] + current_app.logger.info(f"/stream - request_data: {data}, source: {source}", + extra={"data": json.dumps({"request_data": data, "source": source})} + ) + prompt = get_prompt(prompt_id) retriever = RetrieverCreator.create_retriever( @@ -301,7 +305,9 @@ def stream(): mimetype="text/event-stream", ) except Exception as e: - print("\033[91merr", str(e), file=sys.stderr) + current_app.logger.error(f"/stream - error: {str(e)} - traceback: {traceback.format_exc()}", + extra={"error": str(e), "traceback": traceback.format_exc()} + ) message = e.args[0] status_code = 400 # # Custom exceptions with two arguments, index 1 as status code @@ -345,7 +351,6 @@ def api_answer(): else: token_limit = settings.DEFAULT_MAX_HISTORY - # use try and except to check for exception try: # check if the vectorstore is set if "api_key" in data: @@ -365,6 +370,10 @@ def api_answer(): prompt = get_prompt(prompt_id) + current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}", + extra={"data": json.dumps({"request_data": data, "source": source})} + ) + retriever = RetrieverCreator.create_retriever( retriever_name, question=question, @@ -399,9 +408,9 @@ def api_answer(): return result except Exception as e: - # print whole traceback - traceback.print_exc() - print(str(e)) + current_app.logger.error(f"/api/answer - error: {str(e)} - traceback: {traceback.format_exc()}", + extra={"error": str(e), "traceback": traceback.format_exc()} + ) return bad_request(500, str(e)) @@ -433,6 +442,10 @@ def api_search(): token_limit = data["token_limit"] else: token_limit = settings.DEFAULT_MAX_HISTORY + + current_app.logger.info(f"/api/answer - request_data: {data}, source: {source}", + extra={"data": json.dumps({"request_data": data, "source": source})} + ) retriever = RetrieverCreator.create_retriever( retriever_name, diff --git a/application/app.py b/application/app.py index fe8efd12..87d9d42f 100644 --- a/application/app.py +++ b/application/app.py @@ -6,12 +6,14 @@ from application.core.settings import settings from application.api.user.routes import user from application.api.answer.routes import answer from application.api.internal.routes import internal +from application.core.logging_config import setup_logging if platform.system() == "Windows": import pathlib pathlib.PosixPath = pathlib.WindowsPath dotenv.load_dotenv() +setup_logging() app = Flask(__name__) app.register_blueprint(user) diff --git a/application/celery_init.py b/application/celery_init.py index c19c2e75..c5838083 100644 --- a/application/celery_init.py +++ b/application/celery_init.py @@ -1,9 +1,15 @@ from celery import Celery from application.core.settings import settings +from celery.signals import setup_logging def make_celery(app_name=__name__): celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND) celery.conf.update(settings) return celery +@setup_logging.connect +def config_loggers(*args, **kwargs): + from application.core.logging_config import setup_logging + setup_logging() + celery = make_celery() diff --git a/application/core/logging_config.py b/application/core/logging_config.py new file mode 100644 index 00000000..e693cb91 --- /dev/null +++ b/application/core/logging_config.py @@ -0,0 +1,22 @@ +from logging.config import dictConfig + +def setup_logging(): + dictConfig({ + 'version': 1, + 'formatters': { + 'default': { + 'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s', + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", + "formatter": "default", + } + }, + 'root': { + 'level': 'INFO', + 'handlers': ['console'], + }, + }) \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index bd1bc15a..3105aabe 100755 --- a/application/worker.py +++ b/application/worker.py @@ -4,6 +4,7 @@ import string import zipfile import tiktoken from urllib.parse import urljoin +import logging import requests @@ -14,6 +15,7 @@ from application.parser.open_ai_func import call_openai_api from application.parser.schema.base import Document from application.parser.token_func import group_split + # Define a function to extract metadata from a given filename. def metadata_from_filename(title): store = "/".join(title.split("/")[1:3]) @@ -41,7 +43,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): max_depth (int): Maximum allowed depth of recursion to prevent infinite loops. """ if current_depth > max_depth: - print(f"Reached maximum recursion depth of {max_depth}") + logging.warning(f"Reached maximum recursion depth of {max_depth}") return with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -88,16 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user): max_tokens = 1250 recursion_depth = 2 full_path = os.path.join(directory, user, name_job) - import sys - print(full_path, file=sys.stderr) + logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) # check if API_URL env variable is set file_data = {"name": name_job, "file": filename, "user": user} response = requests.get( urljoin(settings.API_URL, "/api/download"), params=file_data ) - # check if file is in the response - print(response, file=sys.stderr) file = response.content if not os.path.exists(full_path): @@ -137,7 +136,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): if sample: for i in range(min(5, len(raw_docs))): - print(raw_docs[i].text) + logging.info(f"Sample document {i}: {raw_docs[i]}") # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) @@ -180,6 +179,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): if not os.path.exists(full_path): os.makedirs(full_path) self.update_state(state="PROGRESS", meta={"current": 1}) + logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, source_data: source_data}) remote_loader = RemoteCreator.create_loader(loader) raw_docs = remote_loader.load_data(source_data) From c49b7613e03174a56cb5da76f643bd04dada627e Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 31 Aug 2024 12:53:37 +0100 Subject: [PATCH 06/13] fix: langchain warning --- application/parser/remote/crawler_loader.py | 2 +- application/parser/remote/sitemap_loader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 2a63f284..76325ae6 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote class CrawlerLoader(BaseRemote): def __init__(self, limit=10): - from langchain.document_loaders import WebBaseLoader + from langchain_community.document_loaders import WebBaseLoader self.loader = WebBaseLoader # Initialize the document loader self.limit = limit # Set the limit for the number of pages to scrape diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index 6e9182c4..8066f4f6 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote class SitemapLoader(BaseRemote): def __init__(self, limit=20): - from langchain.document_loaders import WebBaseLoader + from langchain_community.document_loaders import WebBaseLoader self.loader = WebBaseLoader self.limit = limit # Adding limit to control the number of URLs to process From d9309ebc6eaee86bc5bc0f41a83d6e67f12990a6 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 31 Aug 2024 17:07:40 +0100 Subject: [PATCH 07/13] feat: better token counter --- application/retriever/brave_search.py | 4 ++-- application/retriever/classic_rag.py | 4 ++-- application/retriever/duckduck_search.py | 4 ++-- application/usage.py | 10 ++++----- application/utils.py | 26 +++++++++++++++++++----- application/worker.py | 26 ++---------------------- 6 files changed, 34 insertions(+), 40 deletions(-) diff --git a/application/retriever/brave_search.py b/application/retriever/brave_search.py index 70dbbf20..5d1e1566 100644 --- a/application/retriever/brave_search.py +++ b/application/retriever/brave_search.py @@ -2,7 +2,7 @@ import json from application.retriever.base import BaseRetriever from application.core.settings import settings from application.llm.llm_creator import LLMCreator -from application.utils import count_tokens +from application.utils import num_tokens_from_string from langchain_community.tools import BraveSearch @@ -78,7 +78,7 @@ class BraveRetSearch(BaseRetriever): self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: - tokens_batch = count_tokens(i["prompt"]) + count_tokens( + tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( i["response"] ) if tokens_current_history + tokens_batch < self.token_limit: diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py index 2b77db34..aef6e503 100644 --- a/application/retriever/classic_rag.py +++ b/application/retriever/classic_rag.py @@ -4,7 +4,7 @@ from application.core.settings import settings from application.vectorstore.vector_creator import VectorCreator from application.llm.llm_creator import LLMCreator -from application.utils import count_tokens +from application.utils import num_tokens_from_string class ClassicRAG(BaseRetriever): @@ -98,7 +98,7 @@ class ClassicRAG(BaseRetriever): self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: - tokens_batch = count_tokens(i["prompt"]) + count_tokens( + tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( i["response"] ) if tokens_current_history + tokens_batch < self.token_limit: diff --git a/application/retriever/duckduck_search.py b/application/retriever/duckduck_search.py index bee74e24..6d2965f5 100644 --- a/application/retriever/duckduck_search.py +++ b/application/retriever/duckduck_search.py @@ -1,7 +1,7 @@ from application.retriever.base import BaseRetriever from application.core.settings import settings from application.llm.llm_creator import LLMCreator -from application.utils import count_tokens +from application.utils import num_tokens_from_string from langchain_community.tools import DuckDuckGoSearchResults from langchain_community.utilities import DuckDuckGoSearchAPIWrapper @@ -95,7 +95,7 @@ class DuckDuckSearch(BaseRetriever): self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: - tokens_batch = count_tokens(i["prompt"]) + count_tokens( + tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( i["response"] ) if tokens_current_history + tokens_batch < self.token_limit: diff --git a/application/usage.py b/application/usage.py index 1b26e9d7..aba0ec77 100644 --- a/application/usage.py +++ b/application/usage.py @@ -2,7 +2,7 @@ import sys from pymongo import MongoClient from datetime import datetime from application.core.settings import settings -from application.utils import count_tokens +from application.utils import num_tokens_from_string mongo = MongoClient(settings.MONGO_URI) db = mongo["docsgpt"] @@ -24,9 +24,9 @@ def update_token_usage(user_api_key, token_usage): def gen_token_usage(func): def wrapper(self, model, messages, stream, **kwargs): for message in messages: - self.token_usage["prompt_tokens"] += count_tokens(message["content"]) + self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"]) result = func(self, model, messages, stream, **kwargs) - self.token_usage["generated_tokens"] += count_tokens(result) + self.token_usage["generated_tokens"] += num_tokens_from_string(result) update_token_usage(self.user_api_key, self.token_usage) return result @@ -36,14 +36,14 @@ def gen_token_usage(func): def stream_token_usage(func): def wrapper(self, model, messages, stream, **kwargs): for message in messages: - self.token_usage["prompt_tokens"] += count_tokens(message["content"]) + self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"]) batch = [] result = func(self, model, messages, stream, **kwargs) for r in result: batch.append(r) yield r for line in batch: - self.token_usage["generated_tokens"] += count_tokens(line) + self.token_usage["generated_tokens"] += num_tokens_from_string(line) update_token_usage(self.user_api_key, self.token_usage) return wrapper diff --git a/application/utils.py b/application/utils.py index 3d9bf520..70a00ce0 100644 --- a/application/utils.py +++ b/application/utils.py @@ -1,6 +1,22 @@ -from transformers import GPT2TokenizerFast +import tiktoken -tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') -tokenizer.model_max_length = 100000 -def count_tokens(string): - return len(tokenizer(string)['input_ids']) \ No newline at end of file +_encoding = None + +def get_encoding(): + global _encoding + if _encoding is None: + _encoding = tiktoken.get_encoding("cl100k_base") + return _encoding + +def num_tokens_from_string(string: str) -> int: + encoding = get_encoding() + num_tokens = len(encoding.encode(string)) + return num_tokens + +def count_tokens_docs(docs): + docs_content = "" + for doc in docs: + docs_content += doc.page_content + + tokens = num_tokens_from_string(docs_content) + return tokens \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index 3105aabe..c315f916 100755 --- a/application/worker.py +++ b/application/worker.py @@ -2,7 +2,6 @@ import os import shutil import string import zipfile -import tiktoken from urllib.parse import urljoin import logging @@ -14,6 +13,7 @@ from application.parser.remote.remote_creator import RemoteCreator from application.parser.open_ai_func import call_openai_api from application.parser.schema.base import Document from application.parser.token_func import group_split +from application.utils import count_tokens_docs # Define a function to extract metadata from a given filename. @@ -212,26 +212,4 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"): shutil.rmtree(full_path) - return {"urls": source_data, "name_job": name_job, "user": user, "limited": False} - - -def count_tokens_docs(docs): - # Here we convert the docs list to a string and calculate the number of tokens the string represents. - # docs_content = (" ".join(docs)) - docs_content = "" - for doc in docs: - docs_content += doc.page_content - - tokens, total_price = num_tokens_from_string( - string=docs_content, encoding_name="cl100k_base" - ) - # Here we print the number of tokens and the approx user cost with some visually appealing formatting. - return tokens - - -def num_tokens_from_string(string: str, encoding_name: str) -> int: - # Function to convert string to tokens and estimate user cost. - encoding = tiktoken.get_encoding(encoding_name) - num_tokens = len(encoding.encode(string)) - total_price = (num_tokens / 1000) * 0.0004 - return num_tokens, total_price \ No newline at end of file + return {"urls": source_data, "name_job": name_job, "user": user, "limited": False} \ No newline at end of file From 0aca41f9a69a67ee33184a1759062bf0798537cf Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 1 Sep 2024 11:09:24 +0100 Subject: [PATCH 08/13] fix: faiss dependency --- application/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 01033549..03859dc6 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -9,13 +9,14 @@ EbookLib==0.18 elasticsearch==8.14.0 escodegen==1.0.11 esprima==4.0.1 -faiss-cpu==1.7.4 Flask==3.0.1 +faiss-cpu==1.8.0 gunicorn==23.0.0 html2text==2020.1.16 javalang==0.13.0 langchain==0.1.4 langchain-openai==0.0.5 +logtail-python openapi3_parser==1.1.16 pandas==2.2.2 pydantic_settings==2.4.0 From b630be8c8af5b0a12fedd3dc3730b17b2e2e872a Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 1 Sep 2024 11:10:54 +0100 Subject: [PATCH 09/13] fix: remove logtail --- application/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 03859dc6..b793934b 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -16,7 +16,6 @@ html2text==2020.1.16 javalang==0.13.0 langchain==0.1.4 langchain-openai==0.0.5 -logtail-python openapi3_parser==1.1.16 pandas==2.2.2 pydantic_settings==2.4.0 From 9526ed02584b9946e4c3b9e33dc29e4268a68232 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 2 Sep 2024 19:46:25 +0100 Subject: [PATCH 10/13] feat: added easy way to proxy --- application/core/settings.py | 1 + application/llm/openai.py | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/application/core/settings.py b/application/core/settings.py index 6ae5475c..bbd62fe4 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -29,6 +29,7 @@ class Settings(BaseSettings): OPENAI_API_VERSION: Optional[str] = None # azure openai api version AZURE_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for answering AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for embeddings + OPENAI_BASE_URL: Optional[str] = None # openai base url for open ai compatable models # elasticsearch ELASTIC_CLOUD_ID: Optional[str] = None # cloud id for elasticsearch diff --git a/application/llm/openai.py b/application/llm/openai.py index b1574dd1..73a0c3d1 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -1,26 +1,25 @@ from application.llm.base import BaseLLM from application.core.settings import settings +import logging + class OpenAILLM(BaseLLM): def __init__(self, api_key=None, user_api_key=None, *args, **kwargs): - global openai from openai import OpenAI super().__init__(*args, **kwargs) - self.client = OpenAI( - api_key=api_key, - ) + if settings.OPENAI_BASE_URL: + self.client = OpenAI( + api_key=api_key, + base_url=settings.OPENAI_BASE_URL + ) + else: + self.client = OpenAI(api_key=api_key) self.api_key = api_key self.user_api_key = user_api_key - def _get_openai(self): - # Import openai when needed - import openai - - return openai - def _raw_gen( self, baseself, @@ -29,7 +28,7 @@ class OpenAILLM(BaseLLM): stream=False, engine=settings.AZURE_DEPLOYMENT_NAME, **kwargs - ): + ): response = self.client.chat.completions.create( model=model, messages=messages, stream=stream, **kwargs ) @@ -44,7 +43,7 @@ class OpenAILLM(BaseLLM): stream=True, engine=settings.AZURE_DEPLOYMENT_NAME, **kwargs - ): + ): response = self.client.chat.completions.create( model=model, messages=messages, stream=stream, **kwargs ) @@ -73,8 +72,3 @@ class AzureOpenAILLM(OpenAILLM): api_base=settings.OPENAI_API_BASE, deployment_name=settings.AZURE_DEPLOYMENT_NAME, ) - - def _get_openai(self): - openai = super()._get_openai() - - return openai From 5246d85f118b7a704853bcf9907755bd7ed2f53c Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 2 Sep 2024 20:00:24 +0100 Subject: [PATCH 11/13] fix: ruff --- application/llm/openai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/application/llm/openai.py b/application/llm/openai.py index 73a0c3d1..f85de6ea 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -1,6 +1,5 @@ from application.llm.base import BaseLLM from application.core.settings import settings -import logging From 2de1e5f71a1deb1962b7b6aabf0453fbb1c6020b Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 2 Sep 2024 20:09:16 +0100 Subject: [PATCH 12/13] chore: open ai compatable data --- docs/pages/Guides/How-to-use-different-LLM.mdx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/pages/Guides/How-to-use-different-LLM.mdx b/docs/pages/Guides/How-to-use-different-LLM.mdx index 908ddb28..274660be 100644 --- a/docs/pages/Guides/How-to-use-different-LLM.mdx +++ b/docs/pages/Guides/How-to-use-different-LLM.mdx @@ -36,6 +36,15 @@ List of latest supported LLMs are https://github.com/arc53/DocsGPT/blob/main/app Visit application/llm and select the file of your selected llm and there you will find the speicifc requirements needed to be filled in order to use it,i.e API key of that llm. +### For OpenAI-Compatible Endpoints: +DocsGPT supports the use of OpenAI-compatible endpoints through base URL substitution. This feature allows you to use alternative AI models or services that implement the OpenAI API interface. + + +Set the OPENAI_BASE_URL in your environment. You can change .env file with OPENAI_BASE_URL with the desired base URL or docker-compose.yml file and add the environment variable to the backend container. + +> [!Note] +> Make sure you have the right API_KEY and correct LLM_NAME. + From 817fc5d4b3448417c12e2e8ed78375b41314e873 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 2 Sep 2024 20:11:31 +0100 Subject: [PATCH 13/13] fix: little nextra edit --- docs/pages/Guides/How-to-use-different-LLM.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/pages/Guides/How-to-use-different-LLM.mdx b/docs/pages/Guides/How-to-use-different-LLM.mdx index 274660be..7df77742 100644 --- a/docs/pages/Guides/How-to-use-different-LLM.mdx +++ b/docs/pages/Guides/How-to-use-different-LLM.mdx @@ -42,7 +42,6 @@ DocsGPT supports the use of OpenAI-compatible endpoints through base URL substit Set the OPENAI_BASE_URL in your environment. You can change .env file with OPENAI_BASE_URL with the desired base URL or docker-compose.yml file and add the environment variable to the backend container. -> [!Note] > Make sure you have the right API_KEY and correct LLM_NAME.