chandra/scripts/start_vllm.py

import os
import subprocess
import sys
from chandra.settings import settings
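
# The settings module is expected to provide at least the following fields
# (an illustrative summary inferred from the usages below, not an exhaustive list):
#   VLLM_GPUS          -> value for docker's --gpus "device=..." selector
#   MODEL_CHECKPOINT   -> Hugging Face model id to serve (e.g. the datalab-to/chandra checkpoint)
#   VLLM_MODEL_NAME    -> name exposed via --served-model-name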

# Note: VLLM_ATTENTION_BACKEND can also be set to FLASH_ATTN.
"""
Reference command (equivalent manual docker invocation):

sudo docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
    --env "VLLM_ATTENTION_BACKEND=TORCH_SDPA" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model datalab-to/chandra-0.2.4 \
    --no-enforce-eager \
    --max-num-seqs 32 \
    --dtype bfloat16 \
    --max-model-len 32768 \
    --max_num_batched_tokens 65536 \
    --gpu-memory-utilization .9 \
    --served-model-name chandra
"""


def main():
    # Build the docker command; it mirrors the reference invocation above, but
    # pulls the GPU selection, checkpoint, and served model name from settings.
    cmd = [
        "sudo",
        "docker",
        "run",
        "--runtime", "nvidia",
        "--gpus", f"device={settings.VLLM_GPUS}",
        "-v", f"{os.path.expanduser('~')}/.cache/huggingface:/root/.cache/huggingface",
        # Default to an empty token so the env var never becomes the literal string "None".
        "--env", f"HUGGING_FACE_HUB_TOKEN={os.getenv('HF_TOKEN', '')}",
        "--env", "VLLM_ATTENTION_BACKEND=TORCH_SDPA",
        "-p", "8000:8000",
        "--ipc=host",
        "vllm/vllm-openai:latest",
        "--model", settings.MODEL_CHECKPOINT,
        "--no-enforce-eager",
        "--max-num-seqs", "32",
        "--dtype", "bfloat16",
        "--max-model-len", "32768",
        "--max_num_batched_tokens", "65536",
        "--gpu-memory-utilization", ".9",
        "--served-model-name", settings.VLLM_MODEL_NAME,
    ]

    print(f"Starting vLLM server with command: {' '.join(cmd)}")
    try:
        # subprocess.run() blocks and streams the container's output to this terminal.
        subprocess.run(cmd, check=True)
    except KeyboardInterrupt:
        print("\nShutting down vLLM server...")
        sys.exit(0)
    except subprocess.CalledProcessError as e:
        print(f"\nvLLM server exited with error code {e.returncode}")
        sys.exit(e.returncode)


if __name__ == "__main__":
    main()
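
# To launch the server with this wrapper (a guess at the invocation; the package
# may also expose a console-script entry point instead):
#   python -m chandra.scripts.start_vllm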