From 98a97f34f51e363a3bca5fc108a310faf1249462 Mon Sep 17 00:00:00 2001 From: Anton Larin Date: Sun, 13 Aug 2023 19:25:55 +0200 Subject: [PATCH 1/2] fix packaging and imports and introduce tests with pytest. still issues with celery worker. --- .github/workflows/pytest.yml | 28 ++++++++++++++++ application/__init__.py | 0 application/app.py | 6 ++-- application/parser/file/__init__.py | 1 + application/parser/file/base.py | 2 +- application/parser/file/bulk.py | 18 +++++------ application/parser/file/docs_parser.py | 2 +- application/parser/file/epub_parser.py | 2 +- application/parser/file/html_parser.py | 2 +- application/parser/file/markdown_parser.py | 2 +- application/parser/file/rst_parser.py | 2 +- application/parser/file/tabular_parser.py | 2 +- application/parser/schema/__init__.py | 1 + application/parser/schema/base.py | 2 +- application/parser/token_func.py | 2 +- application/requirements.txt | 1 + application/tests/__init__.py | 0 application/tests/test_app.py | 37 ++++++++++++++++++++++ application/worker.py | 10 +++--- application/wsgi.py | 2 +- docker-compose-azure.yaml | 10 +++--- docker-compose.yaml | 10 +++--- scripts/requirements.txt | 2 -- 23 files changed, 107 insertions(+), 37 deletions(-) create mode 100644 .github/workflows/pytest.yml create mode 100644 application/__init__.py create mode 100644 application/parser/file/__init__.py create mode 100644 application/parser/schema/__init__.py create mode 100644 application/tests/__init__.py create mode 100644 application/tests/test_app.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 00000000..412aca18 --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,28 @@ +name: Run python tests with pytest + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + cd application + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: | + cd application + pytest diff --git a/application/__init__.py b/application/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/app.py b/application/app.py index 156e34a7..5d5a1e56 100644 --- a/application/app.py +++ b/application/app.py @@ -37,9 +37,9 @@ from langchain.schema import HumanMessage, AIMessage from pymongo import MongoClient from werkzeug.utils import secure_filename -from core.settings import settings -from error import bad_request -from worker import ingest_worker +from application.core.settings import settings +from application.error import bad_request +from application.worker import ingest_worker from bson.objectid import ObjectId # os.environ["LANGCHAIN_HANDLER"] = "langchain" diff --git a/application/parser/file/__init__.py b/application/parser/file/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/application/parser/file/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/file/base.py b/application/parser/file/base.py index 2fe9a75d..f63e8ef6 100644 --- a/application/parser/file/base.py +++ b/application/parser/file/base.py @@ -3,7 +3,7 @@ from abc import abstractmethod from typing import Any, List from langchain.docstore.document import Document as LCDocument -from parser.schema.base import Document +from application.parser.schema.base import Document class BaseReader: diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 2be8e328..593681e2 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -3,15 +3,15 @@ import logging from pathlib import Path from typing import Callable, Dict, List, Optional, Union -from parser.file.base import BaseReader -from parser.file.base_parser import BaseParser -from parser.file.docs_parser import DocxParser, PDFParser -from parser.file.epub_parser import EpubParser -from parser.file.html_parser import HTMLParser -from parser.file.markdown_parser import MarkdownParser -from parser.file.rst_parser import RstParser -from parser.file.tabular_parser import PandasCSVParser -from parser.schema.base import Document +from application.parser.file.base import BaseReader +from application.parser.file.base_parser import BaseParser +from application.parser.file.docs_parser import DocxParser, PDFParser +from application.parser.file.epub_parser import EpubParser +from application.parser.file.html_parser import HTMLParser +from application.parser.file.markdown_parser import MarkdownParser +from application.parser.file.rst_parser import RstParser +from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py index 0cde4076..861e8e58 100644 --- a/application/parser/file/docs_parser.py +++ b/application/parser/file/docs_parser.py @@ -6,7 +6,7 @@ Contains parsers for docx, pdf files. from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class PDFParser(BaseParser): diff --git a/application/parser/file/epub_parser.py b/application/parser/file/epub_parser.py index 6ece5ecf..4f5e8711 100644 --- a/application/parser/file/epub_parser.py +++ b/application/parser/file/epub_parser.py @@ -6,7 +6,7 @@ Contains parsers for epub files. from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class EpubParser(BaseParser): diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 96460c7c..f6f885fc 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import Dict, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class HTMLParser(BaseParser): diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py index d8aeb3b0..d906e9b6 100644 --- a/application/parser/file/markdown_parser.py +++ b/application/parser/file/markdown_parser.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast import tiktoken -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class MarkdownParser(BaseParser): diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py index f8feff70..4bd0e6f4 100644 --- a/application/parser/file/rst_parser.py +++ b/application/parser/file/rst_parser.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class RstParser(BaseParser): diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index d7c6402a..81355ae0 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -6,7 +6,7 @@ Contains parsers for tabular data files. from pathlib import Path from typing import Any, Dict, List, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class CSVParser(BaseParser): diff --git a/application/parser/schema/__init__.py b/application/parser/schema/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/application/parser/schema/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py index 3dafda1a..61670f9a 100644 --- a/application/parser/schema/base.py +++ b/application/parser/schema/base.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from langchain.docstore.document import Document as LCDocument -from parser.schema.schema import BaseDocument +from application.parser.schema.schema import BaseDocument @dataclass diff --git a/application/parser/token_func.py b/application/parser/token_func.py index aada673f..14b231fc 100644 --- a/application/parser/token_func.py +++ b/application/parser/token_func.py @@ -3,7 +3,7 @@ from math import ceil from typing import List import tiktoken -from parser.schema.base import Document +from application.parser.schema.base import Document def separate_header_and_body(text): diff --git a/application/requirements.txt b/application/requirements.txt index fc8d2a85..5bb6780b 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -73,6 +73,7 @@ pymongo==4.3.3 pyowm==3.3.0 PyPDF2==3.0.1 PySocks==1.7.1 +pytest python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 diff --git a/application/tests/__init__.py b/application/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/tests/test_app.py b/application/tests/test_app.py new file mode 100644 index 00000000..5bd3e4f4 --- /dev/null +++ b/application/tests/test_app.py @@ -0,0 +1,37 @@ +from application.app import get_vectorstore + + +# Test cases for get_vectorstore function +def test_no_active_docs(): + data = {} + assert get_vectorstore(data) == "" + + +def test_default_active_docs(): + data = {"active_docs": "default"} + assert get_vectorstore(data) == "" + + +def test_local_default_active_docs(): + data = {"active_docs": "local/default"} + assert get_vectorstore(data) == "" + + +def test_local_custom_active_docs(): + data = {"active_docs": "local/custom_index"} + assert get_vectorstore(data) == "indexes/local/custom_index" + + +def test_remote_active_docs(): + data = {"active_docs": "remote_index"} + assert get_vectorstore(data) == "vectors/remote_index" + + +def test_active_docs_not_in_data(): + data = {"other_key": "value"} + assert get_vectorstore(data) == "" + + +def test_multiple_slashes_in_active_docs(): + data = {"active_docs": "local/some/other/index"} + assert get_vectorstore(data) == "indexes/local/some/other/index" diff --git a/application/worker.py b/application/worker.py index 2a3ff24a..da955a7e 100644 --- a/application/worker.py +++ b/application/worker.py @@ -7,11 +7,11 @@ from urllib.parse import urljoin import nltk import requests -from core.settings import settings -from parser.file.bulk import SimpleDirectoryReader -from parser.open_ai_func import call_openai_api -from parser.schema.base import Document -from parser.token_func import group_split +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.open_ai_func import call_openai_api +from application.parser.schema.base import Document +from application.parser.token_func import group_split try: nltk.download('punkt', quiet=True) diff --git a/application/wsgi.py b/application/wsgi.py index 6b8b4d0c..5160e115 100644 --- a/application/wsgi.py +++ b/application/wsgi.py @@ -1,4 +1,4 @@ -from app import app +from application.app import app if __name__ == "__main__": app.run(debug=True, port=7091) diff --git a/docker-compose-azure.yaml b/docker-compose-azure.yaml index a015eef2..e13d6dd8 100644 --- a/docker-compose-azure.yaml +++ b/docker-compose-azure.yaml @@ -13,6 +13,7 @@ services: backend: build: ./application + working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -27,16 +28,17 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/application/indexes + - ./application/inputs:/application/inputs + - ./application/vectors:/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + working_dir: /application + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/docker-compose.yaml b/docker-compose.yaml index a8917af4..9eb42bbe 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -13,6 +13,7 @@ services: backend: build: ./application + working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -22,16 +23,17 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/application/indexes + - ./application/inputs:/application/inputs + - ./application/vectors:/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + working_dir: /application + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/scripts/requirements.txt b/scripts/requirements.txt index f6b0b451..c56feab4 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -110,8 +110,6 @@ tenacity==8.2.2 threadpoolctl==3.2.0 tiktoken==0.4.0 tokenizers==0.13.3 -torch==2.0.1 -torchvision==0.15.2 tqdm==4.65.0 transformers==4.31.0 typer==0.9.0 From 85f9ae5a0adf1545d621c958bb1edcf2a71795eb Mon Sep 17 00:00:00 2001 From: Anton Larin Date: Sun, 13 Aug 2023 21:00:52 +0200 Subject: [PATCH 2/2] fix packaging and imports and introduce tests with pytest. still issues with celery worker. --- .github/workflows/pytest.yml | 3 +-- application/Dockerfile | 4 ++-- application/app.py | 14 +++++++------ application/tests/__init__.py | 0 application/tests/test_app.py | 37 ----------------------------------- docker-compose-azure.yaml | 8 +++----- docker-compose.yaml | 8 +++----- tests/test_app.py | 28 ++++++++++++++++++++++++++ 8 files changed, 45 insertions(+), 57 deletions(-) delete mode 100644 application/tests/__init__.py delete mode 100644 application/tests/test_app.py create mode 100644 tests/test_app.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 412aca18..3ac87141 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -24,5 +24,4 @@ jobs: if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest run: | - cd application - pytest + python -m pytest diff --git a/application/Dockerfile b/application/Dockerfile index 12859724..8c083926 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -14,10 +14,10 @@ FROM python:3.10-slim-bullseye COPY --from=builder /usr/local/ /usr/local/ WORKDIR /app -COPY . /app +COPY . /app/application ENV FLASK_APP=app.py ENV FLASK_DEBUG=true EXPOSE 7091 -CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "wsgi:app"] +CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"] diff --git a/application/app.py b/application/app.py index 5d5a1e56..3ba2de69 100644 --- a/application/app.py +++ b/application/app.py @@ -68,19 +68,20 @@ if platform.system() == "Windows": dotenv.load_dotenv() # load the prompts -with open("prompts/combine_prompt.txt", "r") as f: +current_dir = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(current_dir, "prompts", "combine_prompt.txt"), "r") as f: template = f.read() -with open("prompts/combine_prompt_hist.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "combine_prompt_hist.txt"), "r") as f: template_hist = f.read() -with open("prompts/question_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "question_prompt.txt"), "r") as f: template_quest = f.read() -with open("prompts/chat_combine_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "chat_combine_prompt.txt"), "r") as f: chat_combine_template = f.read() -with open("prompts/chat_reduce_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "chat_reduce_prompt.txt"), "r") as f: chat_reduce_template = f.read() api_key_set = settings.API_KEY is not None @@ -92,7 +93,7 @@ app.config["CELERY_BROKER_URL"] = settings.CELERY_BROKER_URL app.config["CELERY_RESULT_BACKEND"] = settings.CELERY_RESULT_BACKEND app.config["MONGO_URI"] = settings.MONGO_URI celery = Celery() -celery.config_from_object("celeryconfig") +celery.config_from_object("application.celeryconfig") mongo = MongoClient(app.config["MONGO_URI"]) db = mongo["docsgpt"] vectors_collection = db["vectors"] @@ -129,6 +130,7 @@ def get_vectorstore(data): vectorstore = "" else: vectorstore = "" + vectorstore = os.path.join("application", vectorstore) return vectorstore diff --git a/application/tests/__init__.py b/application/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/application/tests/test_app.py b/application/tests/test_app.py deleted file mode 100644 index 5bd3e4f4..00000000 --- a/application/tests/test_app.py +++ /dev/null @@ -1,37 +0,0 @@ -from application.app import get_vectorstore - - -# Test cases for get_vectorstore function -def test_no_active_docs(): - data = {} - assert get_vectorstore(data) == "" - - -def test_default_active_docs(): - data = {"active_docs": "default"} - assert get_vectorstore(data) == "" - - -def test_local_default_active_docs(): - data = {"active_docs": "local/default"} - assert get_vectorstore(data) == "" - - -def test_local_custom_active_docs(): - data = {"active_docs": "local/custom_index"} - assert get_vectorstore(data) == "indexes/local/custom_index" - - -def test_remote_active_docs(): - data = {"active_docs": "remote_index"} - assert get_vectorstore(data) == "vectors/remote_index" - - -def test_active_docs_not_in_data(): - data = {"other_key": "value"} - assert get_vectorstore(data) == "" - - -def test_multiple_slashes_in_active_docs(): - data = {"active_docs": "local/some/other/index"} - assert get_vectorstore(data) == "indexes/local/some/other/index" diff --git a/docker-compose-azure.yaml b/docker-compose-azure.yaml index e13d6dd8..70a16808 100644 --- a/docker-compose-azure.yaml +++ b/docker-compose-azure.yaml @@ -13,7 +13,6 @@ services: backend: build: ./application - working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -28,16 +27,15 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/application/indexes - - ./application/inputs:/application/inputs - - ./application/vectors:/application/vectors + - ./application/indexes:/app/application/indexes + - ./application/inputs:/app/application/inputs + - ./application/vectors:/app/application/vectors depends_on: - redis - mongo worker: build: ./application - working_dir: /application command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY diff --git a/docker-compose.yaml b/docker-compose.yaml index 9eb42bbe..d5dd10e5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -13,7 +13,6 @@ services: backend: build: ./application - working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -23,16 +22,15 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/application/indexes - - ./application/inputs:/application/inputs - - ./application/vectors:/application/vectors + - ./application/indexes:/app/application/indexes + - ./application/inputs:/app/application/inputs + - ./application/vectors:/app/application/vectors depends_on: - redis - mongo worker: build: ./application - working_dir: /application command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY diff --git a/tests/test_app.py b/tests/test_app.py new file mode 100644 index 00000000..8ae9ee34 --- /dev/null +++ b/tests/test_app.py @@ -0,0 +1,28 @@ +from application.app import get_vectorstore +import os + + +# Test cases for get_vectorstore function +def test_no_active_docs(): + data = {} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_local_default_active_docs(): + data = {"active_docs": "local/default"} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_local_non_default_active_docs(): + data = {"active_docs": "local/something"} + assert get_vectorstore(data) == os.path.join("application", "indexes/local/something") + + +def test_default_active_docs(): + data = {"active_docs": "default"} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_complex_active_docs(): + data = {"active_docs": "local/other/path"} + assert get_vectorstore(data) == os.path.join("application", "indexes/local/other/path")