From 98a97f34f51e363a3bca5fc108a310faf1249462 Mon Sep 17 00:00:00 2001 From: Anton Larin Date: Sun, 13 Aug 2023 19:25:55 +0200 Subject: [PATCH] fix packaging and imports and introduce tests with pytest. still issues with celery worker. --- .github/workflows/pytest.yml | 28 ++++++++++++++++ application/__init__.py | 0 application/app.py | 6 ++-- application/parser/file/__init__.py | 1 + application/parser/file/base.py | 2 +- application/parser/file/bulk.py | 18 +++++------ application/parser/file/docs_parser.py | 2 +- application/parser/file/epub_parser.py | 2 +- application/parser/file/html_parser.py | 2 +- application/parser/file/markdown_parser.py | 2 +- application/parser/file/rst_parser.py | 2 +- application/parser/file/tabular_parser.py | 2 +- application/parser/schema/__init__.py | 1 + application/parser/schema/base.py | 2 +- application/parser/token_func.py | 2 +- application/requirements.txt | 1 + application/tests/__init__.py | 0 application/tests/test_app.py | 37 ++++++++++++++++++++++ application/worker.py | 10 +++--- application/wsgi.py | 2 +- docker-compose-azure.yaml | 10 +++--- docker-compose.yaml | 10 +++--- scripts/requirements.txt | 2 -- 23 files changed, 107 insertions(+), 37 deletions(-) create mode 100644 .github/workflows/pytest.yml create mode 100644 application/__init__.py create mode 100644 application/parser/file/__init__.py create mode 100644 application/parser/schema/__init__.py create mode 100644 application/tests/__init__.py create mode 100644 application/tests/test_app.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 00000000..412aca18 --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,28 @@ +name: Run python tests with pytest + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + cd application + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: | + cd application + pytest diff --git a/application/__init__.py b/application/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/app.py b/application/app.py index 156e34a7..5d5a1e56 100644 --- a/application/app.py +++ b/application/app.py @@ -37,9 +37,9 @@ from langchain.schema import HumanMessage, AIMessage from pymongo import MongoClient from werkzeug.utils import secure_filename -from core.settings import settings -from error import bad_request -from worker import ingest_worker +from application.core.settings import settings +from application.error import bad_request +from application.worker import ingest_worker from bson.objectid import ObjectId # os.environ["LANGCHAIN_HANDLER"] = "langchain" diff --git a/application/parser/file/__init__.py b/application/parser/file/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/application/parser/file/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/file/base.py b/application/parser/file/base.py index 2fe9a75d..f63e8ef6 100644 --- a/application/parser/file/base.py +++ b/application/parser/file/base.py @@ -3,7 +3,7 @@ from abc import abstractmethod from typing import Any, List from langchain.docstore.document import Document as LCDocument -from parser.schema.base import Document +from application.parser.schema.base import Document class BaseReader: diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 2be8e328..593681e2 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -3,15 +3,15 @@ import logging from pathlib import Path from typing import Callable, Dict, List, Optional, Union -from parser.file.base import BaseReader -from parser.file.base_parser import BaseParser -from parser.file.docs_parser import DocxParser, PDFParser -from parser.file.epub_parser import EpubParser -from parser.file.html_parser import HTMLParser -from parser.file.markdown_parser import MarkdownParser -from parser.file.rst_parser import RstParser -from parser.file.tabular_parser import PandasCSVParser -from parser.schema.base import Document +from application.parser.file.base import BaseReader +from application.parser.file.base_parser import BaseParser +from application.parser.file.docs_parser import DocxParser, PDFParser +from application.parser.file.epub_parser import EpubParser +from application.parser.file.html_parser import HTMLParser +from application.parser.file.markdown_parser import MarkdownParser +from application.parser.file.rst_parser import RstParser +from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py index 0cde4076..861e8e58 100644 --- a/application/parser/file/docs_parser.py +++ b/application/parser/file/docs_parser.py @@ -6,7 +6,7 @@ Contains parsers for docx, pdf files. from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class PDFParser(BaseParser): diff --git a/application/parser/file/epub_parser.py b/application/parser/file/epub_parser.py index 6ece5ecf..4f5e8711 100644 --- a/application/parser/file/epub_parser.py +++ b/application/parser/file/epub_parser.py @@ -6,7 +6,7 @@ Contains parsers for epub files. from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class EpubParser(BaseParser): diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 96460c7c..f6f885fc 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import Dict, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class HTMLParser(BaseParser): diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py index d8aeb3b0..d906e9b6 100644 --- a/application/parser/file/markdown_parser.py +++ b/application/parser/file/markdown_parser.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast import tiktoken -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class MarkdownParser(BaseParser): diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py index f8feff70..4bd0e6f4 100644 --- a/application/parser/file/rst_parser.py +++ b/application/parser/file/rst_parser.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class RstParser(BaseParser): diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index d7c6402a..81355ae0 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -6,7 +6,7 @@ Contains parsers for tabular data files. from pathlib import Path from typing import Any, Dict, List, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class CSVParser(BaseParser): diff --git a/application/parser/schema/__init__.py b/application/parser/schema/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/application/parser/schema/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py index 3dafda1a..61670f9a 100644 --- a/application/parser/schema/base.py +++ b/application/parser/schema/base.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from langchain.docstore.document import Document as LCDocument -from parser.schema.schema import BaseDocument +from application.parser.schema.schema import BaseDocument @dataclass diff --git a/application/parser/token_func.py b/application/parser/token_func.py index aada673f..14b231fc 100644 --- a/application/parser/token_func.py +++ b/application/parser/token_func.py @@ -3,7 +3,7 @@ from math import ceil from typing import List import tiktoken -from parser.schema.base import Document +from application.parser.schema.base import Document def separate_header_and_body(text): diff --git a/application/requirements.txt b/application/requirements.txt index fc8d2a85..5bb6780b 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -73,6 +73,7 @@ pymongo==4.3.3 pyowm==3.3.0 PyPDF2==3.0.1 PySocks==1.7.1 +pytest python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 diff --git a/application/tests/__init__.py b/application/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/tests/test_app.py b/application/tests/test_app.py new file mode 100644 index 00000000..5bd3e4f4 --- /dev/null +++ b/application/tests/test_app.py @@ -0,0 +1,37 @@ +from application.app import get_vectorstore + + +# Test cases for get_vectorstore function +def test_no_active_docs(): + data = {} + assert get_vectorstore(data) == "" + + +def test_default_active_docs(): + data = {"active_docs": "default"} + assert get_vectorstore(data) == "" + + +def test_local_default_active_docs(): + data = {"active_docs": "local/default"} + assert get_vectorstore(data) == "" + + +def test_local_custom_active_docs(): + data = {"active_docs": "local/custom_index"} + assert get_vectorstore(data) == "indexes/local/custom_index" + + +def test_remote_active_docs(): + data = {"active_docs": "remote_index"} + assert get_vectorstore(data) == "vectors/remote_index" + + +def test_active_docs_not_in_data(): + data = {"other_key": "value"} + assert get_vectorstore(data) == "" + + +def test_multiple_slashes_in_active_docs(): + data = {"active_docs": "local/some/other/index"} + assert get_vectorstore(data) == "indexes/local/some/other/index" diff --git a/application/worker.py b/application/worker.py index 2a3ff24a..da955a7e 100644 --- a/application/worker.py +++ b/application/worker.py @@ -7,11 +7,11 @@ from urllib.parse import urljoin import nltk import requests -from core.settings import settings -from parser.file.bulk import SimpleDirectoryReader -from parser.open_ai_func import call_openai_api -from parser.schema.base import Document -from parser.token_func import group_split +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.open_ai_func import call_openai_api +from application.parser.schema.base import Document +from application.parser.token_func import group_split try: nltk.download('punkt', quiet=True) diff --git a/application/wsgi.py b/application/wsgi.py index 6b8b4d0c..5160e115 100644 --- a/application/wsgi.py +++ b/application/wsgi.py @@ -1,4 +1,4 @@ -from app import app +from application.app import app if __name__ == "__main__": app.run(debug=True, port=7091) diff --git a/docker-compose-azure.yaml b/docker-compose-azure.yaml index a015eef2..e13d6dd8 100644 --- a/docker-compose-azure.yaml +++ b/docker-compose-azure.yaml @@ -13,6 +13,7 @@ services: backend: build: ./application + working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -27,16 +28,17 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/application/indexes + - ./application/inputs:/application/inputs + - ./application/vectors:/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + working_dir: /application + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/docker-compose.yaml b/docker-compose.yaml index a8917af4..9eb42bbe 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -13,6 +13,7 @@ services: backend: build: ./application + working_dir: /application environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY @@ -22,16 +23,17 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/application/indexes + - ./application/inputs:/application/inputs + - ./application/vectors:/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + working_dir: /application + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/scripts/requirements.txt b/scripts/requirements.txt index f6b0b451..c56feab4 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -110,8 +110,6 @@ tenacity==8.2.2 threadpoolctl==3.2.0 tiktoken==0.4.0 tokenizers==0.13.3 -torch==2.0.1 -torchvision==0.15.2 tqdm==4.65.0 transformers==4.31.0 typer==0.9.0