Files
DocsGPT/application/updates/version_check.py
2026-04-21 14:22:32 +01:00

303 lines
10 KiB
Python

"""Anonymous startup version-check client.
Called once per Celery worker boot (see ``application/celery_init.py``
``worker_ready`` handler). Posts the running version + anonymous
instance UUID to ``gptcloud.arc53.com/api/check``, caches the response
in Redis, and surfaces any advisories to stdout + logs.
Design invariants — all enforced by a broad ``try/except`` at the top
of :func:`run_check`:
* Never blocks worker startup (fired from a daemon thread).
* Never raises to the caller (every failure is swallowed + logged at
``DEBUG``).
* Opt-out via ``VERSION_CHECK=0`` short-circuits before any Postgres
write, Redis access, or outbound request.
* Redis coordinates multi-worker and multi-replica deployments — the
first worker to acquire ``docsgpt:version_check:lock`` fetches, the
rest read from the cached response on the next cycle.
"""
from __future__ import annotations
import json
import logging
import os
import platform
import socket
import sys
from typing import Any, Dict, Optional
import requests
from application.cache import get_redis_instance
from application.core.settings import settings
from application.storage.db.repositories.app_metadata import AppMetadataRepository
from application.storage.db.session import db_session
from application.version import get_version
logger = logging.getLogger(__name__)
ENDPOINT_URL = "https://gptcloud.arc53.com/api/check"
CLIENT_NAME = "docsgpt-backend"
REQUEST_TIMEOUT_SECONDS = 5
CACHE_KEY = "docsgpt:version_check:response"
LOCK_KEY = "docsgpt:version_check:lock"
CACHE_TTL_SECONDS = 6 * 3600 # 6h default; shortened by response `next_check_after`.
LOCK_TTL_SECONDS = 60
NOTICE_KEY = "version_check_notice_shown"
INSTANCE_ID_KEY = "instance_id"
_HIGH_SEVERITIES = {"high", "critical"}
_ANSI_RESET = "\033[0m"
_ANSI_RED = "\033[31m"
_ANSI_YELLOW = "\033[33m"
def run_check() -> None:
"""Entry point for the worker-startup daemon thread.
Safe to call unconditionally: the opt-out, Redis-outage, and
Postgres-outage paths all return silently. No exception propagates.
"""
try:
_run_check_inner()
except Exception as exc: # noqa: BLE001 — belt-and-braces; nothing escapes.
logger.debug("version check crashed: %s", exc, exc_info=True)
def _run_check_inner() -> None:
if not settings.VERSION_CHECK:
return
instance_id = _resolve_instance_id_and_notice()
if instance_id is None:
# Postgres unavailable — per spec we skip the check entirely
# rather than phone home with a synthetic/ephemeral UUID.
return
redis_client = get_redis_instance()
cached = _read_cache(redis_client)
if cached is not None:
_render_advisories(cached)
return
# Cache miss. Try to win the lock; if another worker has it, skip.
# ``redis_client is None`` here means Redis is unreachable — per the
# spec we still proceed uncached (acceptable duplicate calls in
# multi-worker Redis-less deploys).
if redis_client is not None and not _acquire_lock(redis_client):
return
response = _fetch(instance_id)
if response is None:
if redis_client is not None:
_release_lock(redis_client)
return
_write_cache(redis_client, response)
_render_advisories(response)
if redis_client is not None:
_release_lock(redis_client)
def _resolve_instance_id_and_notice() -> Optional[str]:
"""Load (or create) the instance UUID and emit the first-run notice.
The notice is printed at most once across the lifetime of the
installation — tracked via the ``version_check_notice_shown`` row
in ``app_metadata``. Both reads and the write happen inside one
short transaction so two racing workers can't each emit the notice.
"""
try:
with db_session() as conn:
repo = AppMetadataRepository(conn)
instance_id = repo.get_or_create_instance_id()
if repo.get(NOTICE_KEY) is None:
_print_first_run_notice()
repo.set(NOTICE_KEY, "1")
return instance_id
except Exception as exc: # noqa: BLE001 — Postgres down, bad URI, etc.
logger.debug("version check: Postgres unavailable (%s)", exc, exc_info=True)
return None
def _print_first_run_notice() -> None:
message = (
"Anonymous version check enabled — sends version to "
"gptcloud.arc53.com.\nDisable with VERSION_CHECK=0."
)
print(message, flush=True)
logger.info("version check: first-run notice shown")
def _read_cache(redis_client) -> Optional[Dict[str, Any]]:
if redis_client is None:
return None
try:
raw = redis_client.get(CACHE_KEY)
except Exception as exc: # noqa: BLE001 — Redis transient errors.
logger.debug("version check: cache GET failed (%s)", exc, exc_info=True)
return None
if raw is None:
return None
try:
return json.loads(raw.decode("utf-8") if isinstance(raw, bytes) else raw)
except (ValueError, AttributeError) as exc:
logger.debug("version check: cache decode failed (%s)", exc, exc_info=True)
return None
def _write_cache(redis_client, response: Dict[str, Any]) -> None:
if redis_client is None:
return
ttl = _compute_ttl(response)
try:
redis_client.setex(CACHE_KEY, ttl, json.dumps(response))
except Exception as exc: # noqa: BLE001
logger.debug("version check: cache SETEX failed (%s)", exc, exc_info=True)
def _compute_ttl(response: Dict[str, Any]) -> int:
"""Cap the cache at 6h but honor a shorter server-specified window."""
next_after = response.get("next_check_after")
if isinstance(next_after, (int, float)) and next_after > 0:
return max(1, min(CACHE_TTL_SECONDS, int(next_after)))
return CACHE_TTL_SECONDS
def _acquire_lock(redis_client) -> bool:
try:
owner = f"{socket.gethostname()}:{os.getpid()}"
return bool(
redis_client.set(LOCK_KEY, owner, nx=True, ex=LOCK_TTL_SECONDS)
)
except Exception as exc: # noqa: BLE001
# Treat a failing Redis the same as "no lock infra" — skip rather
# than fire without coordination, because Redis outage is
# usually transient and one missed cycle is harmless.
logger.debug("version check: lock acquire failed (%s)", exc, exc_info=True)
return False
def _release_lock(redis_client) -> None:
try:
redis_client.delete(LOCK_KEY)
except Exception as exc: # noqa: BLE001
logger.debug("version check: lock release failed (%s)", exc, exc_info=True)
def _fetch(instance_id: str) -> Optional[Dict[str, Any]]:
version = get_version()
if version in ("", "unknown"):
# The endpoint rejects payloads without a valid semver, and the
# rejection is otherwise logged at DEBUG — invisible under the
# usual ``-l INFO`` Celery worker start. Surface it loudly so a
# misconfigured release (missing or unset ``__version__``) is
# obvious instead of silently disabling the check.
logger.warning(
"version check: skipping — get_version() returned %r. "
"Set __version__ in application/version.py to a valid "
"version string.",
version,
)
return None
payload = {
"version": version,
"instance_id": instance_id,
"python_version": platform.python_version(),
"platform": sys.platform,
"client": CLIENT_NAME,
}
try:
resp = requests.post(
ENDPOINT_URL,
json=payload,
timeout=REQUEST_TIMEOUT_SECONDS,
)
except requests.RequestException as exc:
logger.debug("version check: request failed (%s)", exc, exc_info=True)
return None
if resp.status_code >= 400:
logger.debug("version check: non-2xx response %s", resp.status_code)
return None
try:
return resp.json()
except ValueError as exc:
logger.debug("version check: response decode failed (%s)", exc, exc_info=True)
return None
def _render_advisories(response: Dict[str, Any]) -> None:
advisories = response.get("advisories") or []
if not isinstance(advisories, list):
return
current_version = get_version()
for advisory in advisories:
if not isinstance(advisory, dict):
continue
severity = str(advisory.get("severity", "")).lower()
advisory_id = advisory.get("id", "UNKNOWN")
title = advisory.get("title", "")
url = advisory.get("url", "")
fixed_in = advisory.get("fixed_in")
summary = advisory.get(
"summary",
f"Your DocsGPT version {current_version} is vulnerable.",
)
logger.warning(
"security advisory %s (severity=%s) affects version %s: %s%s%s",
advisory_id,
severity or "unknown",
current_version,
title or summary,
f" — fixed in {fixed_in}" if fixed_in else "",
f"{url}" if url else "",
)
if severity in _HIGH_SEVERITIES:
_print_console_advisory(
advisory_id=advisory_id,
title=title,
severity=severity,
summary=summary,
fixed_in=fixed_in,
url=url,
)
def _print_console_advisory(
*,
advisory_id: str,
title: str,
severity: str,
summary: str,
fixed_in: Optional[str],
url: str,
) -> None:
color = _ANSI_RED if severity == "critical" else _ANSI_YELLOW
bar = "=" * 60
upgrade_line = ""
if fixed_in and url:
upgrade_line = f" Upgrade to {fixed_in}+ — {url}"
elif fixed_in:
upgrade_line = f" Upgrade to {fixed_in}+"
elif url:
upgrade_line = f" {url}"
lines = [
bar,
f"\u26a0 SECURITY ADVISORY: {advisory_id}",
f" {summary}",
f" {title} (severity: {severity})" if title else f" severity: {severity}",
]
if upgrade_line:
lines.append(upgrade_line)
lines.append(bar)
print(f"{color}{chr(10).join(lines)}{_ANSI_RESET}", flush=True)