feat: add error telemetry with stage tracking

Sends minimal data to Scarf on install/update failures: event type,
stage, and exit code (plus the version, installation ID, and OS already sent
by regular telemetry). Uses an ERR trap in the orchestrator scripts. Respects the existing opt-out settings.
This commit is contained in:
Yury Kossakovsky
2025-12-25 17:22:18 -07:00
parent a86cf8893c
commit a1521f8a48
6 changed files with 109 additions and 14 deletions

View File

@@ -98,7 +98,7 @@ DATABASES=(
"lightrag"
"postiz"
"waha"
"$ARGUMENTS" # Add your service here
"new_data_base_name" # Add your service here
)
```
@@ -114,10 +114,10 @@ This script:
Example in docker-compose.yml:
```yaml
environment:
DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/$ARGUMENTS
DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/new_data_base_name
# OR individual vars:
POSTGRES_HOST: postgres
POSTGRES_DATABASE: $ARGUMENTS
POSTGRES_DATABASE: new_data_base_name
```
### 1.6 Proxy Configuration (for outbound AI API calls)
@@ -563,14 +563,14 @@ Add to `SERVICE_METADATA` object (~line 145):
### Categories
| Category | Description | Examples |
|----------|-------------|----------|
| `ai` | AI/ML services | Flowise, LangChain, Ollama, LightRAG |
| `database` | Data storage | PostgreSQL, Qdrant, Weaviate, Neo4j |
| `monitoring` | Observability | Prometheus, Grafana, Langfuse |
| `tools` | Utilities | Gotenberg, Docling, LibreTranslate, PaddleOCR |
| `infra` | Infrastructure | Caddy, Redis, Gost, Portainer |
| `automation` | Workflow automation | n8n, Postiz |
| Category | Description | Examples |
| ------------ | ------------------- | --------------------------------------------- |
| `ai` | AI/ML services | Flowise, LangChain, Ollama, LightRAG |
| `database` | Data storage | PostgreSQL, Qdrant, Weaviate, Neo4j |
| `monitoring` | Observability | Prometheus, Grafana, Langfuse |
| `tools` | Utilities | Gotenberg, Docling, LibreTranslate, PaddleOCR |
| `infra` | Infrastructure | Caddy, Redis, Gost, Portainer |
| `automation` | Workflow automation | n8n, Postiz |
### Color Examples

View File

@@ -4,7 +4,6 @@
# [optional]
# Anonymous telemetry - helps improve the project
# Set to false to disable (default: true)
# INSTALLATION_ID is auto-generated, do not modify
############
# SCARF_ANALYTICS=false

View File

@@ -21,6 +21,9 @@ set -e
source "$(dirname "$0")/utils.sh"
init_paths
# Setup error telemetry trap for tracking failures
setup_error_telemetry_trap
# Set the compose command explicitly to use docker compose subcommand
COMPOSE_CMD="docker compose"
@@ -35,7 +38,8 @@ cd "$PROJECT_ROOT"
# Send telemetry: update started
send_telemetry "update_start"
# --- Call 03_generate_secrets.sh in update mode ---
# --- Call 03_generate_secrets.sh in update mode ---
set_telemetry_stage "update_env"
log_info "Ensuring .env file is up-to-date with all variables..."
bash "$SCRIPT_DIR/03_generate_secrets.sh" --update || {
log_error "Failed to update .env configuration via 03_generate_secrets.sh. Update process cannot continue."
@@ -44,7 +48,8 @@ bash "$SCRIPT_DIR/03_generate_secrets.sh" --update || {
log_success ".env file updated successfully."
# --- End of .env update by 03_generate_secrets.sh ---
# --- Run Service Selection Wizard FIRST to get updated profiles ---
# --- Run Service Selection Wizard FIRST to get updated profiles ---
set_telemetry_stage "update_wizard"
log_info "Running Service Selection Wizard to update service choices..."
bash "$SCRIPT_DIR/04_wizard.sh" || {
log_error "Service Selection Wizard failed. Update process cannot continue."
@@ -54,6 +59,7 @@ log_success "Service selection updated."
# --- End of Service Selection Wizard ---
# --- Configure Services (prompts and .env updates) ---
set_telemetry_stage "update_configure"
log_info "Configuring services (.env updates for optional inputs)..."
bash "$SCRIPT_DIR/05_configure_services.sh" || {
log_error "Configure Services failed. Update process cannot continue."
@@ -65,6 +71,7 @@ log_success "Service configuration completed."
cleanup_legacy_n8n_workers
# Pull latest versions of selected containers based on updated .env
set_telemetry_stage "update_docker_pull"
log_info "Pulling latest versions of selected containers..."
COMPOSE_FILES_FOR_PULL=("-f" "$PROJECT_ROOT/docker-compose.yml")
@@ -96,6 +103,7 @@ $COMPOSE_CMD -p "localai" "${COMPOSE_FILES_FOR_PULL[@]}" pull --ignore-buildable
}
# Start PostgreSQL first to initialize databases before other services
set_telemetry_stage "update_db_init"
log_info "Starting PostgreSQL..."
$COMPOSE_CMD -p "localai" up -d postgres || { log_error "Failed to start PostgreSQL"; exit 1; }
@@ -104,12 +112,14 @@ $COMPOSE_CMD -p "localai" up -d postgres || { log_error "Failed to start Postgre
bash "$SCRIPT_DIR/init_databases.sh" || { log_warning "Database initialization had issues, but continuing..."; }
# Start all services using the 06_run_services.sh script (postgres is already running)
set_telemetry_stage "update_services_start"
log_info "Running Services..."
bash "$RUN_SERVICES_SCRIPT" || { log_error "Failed to start services. Check logs for details."; exit 1; }
log_success "Update application completed successfully!"
# --- Fix file permissions ---
set_telemetry_stage "update_fix_perms"
log_info "Fixing file permissions..."
bash "$SCRIPT_DIR/08_fix_permissions.sh" || {
log_warning "Failed to fix file permissions. This does not affect the update."
@@ -117,6 +127,7 @@ bash "$SCRIPT_DIR/08_fix_permissions.sh" || {
# --- End of Fix permissions ---
# --- Display Final Report with Credentials ---
set_telemetry_stage "update_final_report"
bash "$SCRIPT_DIR/07_final_report.sh" || {
log_warning "Failed to display the final report. This does not affect the update."
# We don't exit 1 here as the update itself was successful.

View File

@@ -47,6 +47,9 @@ fi
# Initialize paths using utils.sh helper
init_paths
# Setup error telemetry trap for tracking failures
setup_error_telemetry_trap
# Generate installation ID for telemetry correlation (before .env exists)
# This ID will be saved to .env by 03_generate_secrets.sh
INSTALLATION_ID=$(get_installation_id)
@@ -108,26 +111,32 @@ fi
# Run installation steps sequentially using their full paths
show_step 1 8 "System Preparation"
set_telemetry_stage "system_prep"
bash "$SCRIPT_DIR/01_system_preparation.sh" || { log_error "System Preparation failed"; exit 1; }
log_success "System preparation complete!"
show_step 2 8 "Installing Docker"
set_telemetry_stage "docker_install"
bash "$SCRIPT_DIR/02_install_docker.sh" || { log_error "Docker Installation failed"; exit 1; }
log_success "Docker installation complete!"
show_step 3 8 "Generating Secrets and Configuration"
set_telemetry_stage "secrets_gen"
bash "$SCRIPT_DIR/03_generate_secrets.sh" || { log_error "Secret/Config Generation failed"; exit 1; }
log_success "Secret/Config Generation complete!"
show_step 4 8 "Running Service Selection Wizard"
set_telemetry_stage "wizard"
bash "$SCRIPT_DIR/04_wizard.sh" || { log_error "Service Selection Wizard failed"; exit 1; }
log_success "Service Selection Wizard complete!"
show_step 5 8 "Configure Services"
set_telemetry_stage "configure"
bash "$SCRIPT_DIR/05_configure_services.sh" || { log_error "Configure Services failed"; exit 1; }
log_success "Configure Services complete!"
show_step 6 8 "Running Services"
set_telemetry_stage "db_init"
# Start PostgreSQL first to initialize databases before other services
log_info "Starting PostgreSQL..."
docker compose -p localai up -d postgres || { log_error "Failed to start PostgreSQL"; exit 1; }
@@ -137,10 +146,12 @@ docker compose -p localai up -d postgres || { log_error "Failed to start Postgre
bash "$SCRIPT_DIR/init_databases.sh" || { log_warning "Database initialization had issues, but continuing..."; }
# Now start all services (postgres is already running)
set_telemetry_stage "services_start"
bash "$SCRIPT_DIR/06_run_services.sh" || { log_error "Running Services failed"; exit 1; }
log_success "Running Services complete!"
show_step 7 8 "Generating Final Report"
set_telemetry_stage "final_report"
# --- Installation Summary ---
log_info "Installation Summary:"
echo -e " ${GREEN}*${NC} System updated and basic utilities installed"
@@ -155,6 +166,7 @@ bash "$SCRIPT_DIR/07_final_report.sh" || { log_error "Final Report Generation fa
log_success "Final Report generated!"
show_step 8 8 "Fixing File Permissions"
set_telemetry_stage "fix_perms"
bash "$SCRIPT_DIR/08_fix_permissions.sh" || { log_error "Fix Permissions failed"; exit 1; }
log_success "File permissions fixed!"

View File

@@ -24,6 +24,9 @@ set -e
source "$(dirname "$0")/utils.sh"
init_paths
# Setup error telemetry trap for tracking failures
setup_error_telemetry_trap
# Global variable to track backup path for cleanup
BACKUP_PATH=""
@@ -49,6 +52,7 @@ fi
log_info "Starting update process..."
set_telemetry_stage "git_update"
# Pull the latest repository changes
log_info "Pulling latest repository changes..."
@@ -105,6 +109,7 @@ else
fi
# Update Ubuntu packages before running apply_update
set_telemetry_stage "git_system_packages"
log_info "Updating system packages..."
if command -v apt-get &> /dev/null; then
sudo apt-get update && sudo apt-get upgrade -y
@@ -115,6 +120,7 @@ fi
# Execute the rest of the update process using the (potentially updated) apply_update.sh
# Note: apply_update.sh has its own error telemetry trap and stages
bash "$APPLY_UPDATE_SCRIPT"
exit 0

View File

@@ -913,3 +913,70 @@ send_telemetry() {
# Send telemetry in background with short timeout (non-blocking, silent)
curl -sf --connect-timeout 2 --max-time 2 "$url" >/dev/null 2>&1 &
}
#=============================================================================
# ERROR TELEMETRY
#=============================================================================
# Tracks installation/update failures with stage and exit code.
# Uses same opt-out as regular telemetry (SCARF_ANALYTICS=false or DO_NOT_TRACK=1)
# Record which stage of the install/update is currently running.
# The value is exported, so child processes inherit it, and it is what
# send_error_telemetry reports as the "stage" of a failure.
# Usage: set_telemetry_stage "system_prep"
# Arguments:
#   $1 - stage name
set_telemetry_stage() {
    TELEMETRY_STAGE="$1"
    export TELEMETRY_STAGE
}
# Send error telemetry event (called by trap handler)
# Usage: send_error_telemetry 1
# Arguments:
#   $1 - exit_code: The exit code that triggered the error (default: 1)
# Globals read:
#   TELEMETRY_STAGE, TELEMETRY_ERROR_SENT, ENV_FILE, PROJECT_ROOT,
#   SCARF_ENDPOINT, SCARF_ANALYTICS, DO_NOT_TRACK
# Globals written:
#   TELEMETRY_ERROR_SENT (exported, set to 1 once a send has been attempted)
# Returns:
#   Always 0. This runs while the caller is already failing (usually under
#   `set -e`), so every step is best-effort and must never abort the caller
#   or replace its exit status.
send_error_telemetry() {
    local exit_code="${1:-1}"
    local stage="${TELEMETRY_STAGE:-unknown}"

    # Prevent duplicate sends
    if [[ "${TELEMETRY_ERROR_SENT:-0}" == "1" ]]; then
        return 0
    fi
    export TELEMETRY_ERROR_SENT=1

    # Load environment (for SCARF_ANALYTICS check).
    # `|| true`: a .env whose last line is not valid shell (e.g. an unquoted
    # value containing a space) makes `source` return non-zero, which would
    # otherwise trip errexit in the caller before anything is sent.
    if [[ -f "${ENV_FILE:-}" ]]; then
        # shellcheck disable=SC1090
        source "$ENV_FILE" 2>/dev/null || true
    fi

    # Opt-out check: respect SCARF_ANALYTICS and DO_NOT_TRACK
    if [[ "${SCARF_ANALYTICS:-true}" == "false" || "${DO_NOT_TRACK:-0}" == "1" ]]; then
        return 0
    fi

    # Without an endpoint there is nothing to call
    if [[ -z "${SCARF_ENDPOINT:-}" ]]; then
        return 0
    fi

    # Determine event type based on stage prefix
    local event_type="install_error"
    if [[ "$stage" == update_* || "$stage" == git_* ]]; then
        event_type="update_error"
    fi

    # Get installation ID and OS (helper failures must not abort the handler)
    local install_id
    install_id=$(get_installation_id) || true
    local os_type
    os_type=$(get_os_type) || true

    # Get version from VERSION file, stripping CR/LF
    local version="unknown"
    if [[ -f "${PROJECT_ROOT:-}/VERSION" ]]; then
        version=$(tr -d '\n\r' < "${PROJECT_ROOT:-}/VERSION") || version="unknown"
    fi

    # Build URL with error-specific parameters
    local url="${SCARF_ENDPOINT}?event=${event_type}&version=${version}&id=${install_id}&os=${os_type}&stage=${stage}&exit_code=${exit_code}"

    # Send telemetry in background with short timeout (non-blocking, silent)
    curl -sf --connect-timeout 2 --max-time 2 "$url" >/dev/null 2>&1 &

    return 0
}
# Internal trap handler for ERR signal.
# Captures the status of the command that fired the trap, reports it through
# send_error_telemetry, then exits the shell with that same status.
# Not meant to be called directly; it is installed by
# setup_error_telemetry_trap.
_telemetry_error_handler() {
    # Must be the first statement: any other command would overwrite $?
    local exit_code=$?

    # Only send if we actually have an error (exit_code > 0).
    # `|| true` keeps the telemetry path best-effort: running it on the left
    # of `||` suspends errexit inside it, so a failure there can neither abort
    # this handler early nor replace the exit code we are about to re-raise.
    if [[ "$exit_code" -gt 0 ]]; then
        send_error_telemetry "$exit_code" || true
    fi

    # Re-exit with the original code (important for set -e behavior)
    exit "$exit_code"
}
# Install _telemetry_error_handler as the shell's ERR trap.
# Usage: setup_error_telemetry_trap
# Note: Only call this in top-level orchestrator scripts (install.sh, apply_update.sh, update.sh)
# NOTE(review): per bash semantics, a plain ERR trap does not fire for a
# command that fails on the left of `||` / `&&` or inside an `if` condition,
# and is not inherited by functions or subshells unless errtrace (`set -E`)
# is enabled - so `cmd || { ...; exit 1; }` failures are not reported by it.
setup_error_telemetry_trap() {
    trap -- _telemetry_error_handler ERR
}