chandra/scripts/start_vllm.py

import os
import subprocess
import sys
from chandra.settings import settings
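
# The settings module is expected to provide at least the following fields
# (an illustrative summary inferred from the usages below, not an exhaustive list):
#   VLLM_GPUS          -> value for docker's --gpus "device=..." selector
#   MODEL_CHECKPOINT   -> Hugging Face model id to serve (e.g. the datalab-to/chandra checkpoint)
#   VLLM_MODEL_NAME    -> name exposed via --served-model-name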

# Note: VLLM_ATTENTION_BACKEND can also be set to FLASH_ATTN.
"""
Reference command (equivalent manual docker invocation):

sudo docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
    --env "VLLM_ATTENTION_BACKEND=TORCH_SDPA" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model datalab-to/chandra-0.2.4 \
    --no-enforce-eager \
    --max-num-seqs 32 \
    --dtype bfloat16 \
    --max-model-len 32768 \
    --max_num_batched_tokens 65536 \
    --gpu-memory-utilization .9 \
    --served-model-name chandra
"""


def main():
    # Build the docker command; it mirrors the reference invocation above, but
    # pulls the GPU selection, checkpoint, and served model name from settings.
    cmd = [
        "sudo",
        "docker",
        "run",
        "--runtime", "nvidia",
        "--gpus", f"device={settings.VLLM_GPUS}",
        "-v", f"{os.path.expanduser('~')}/.cache/huggingface:/root/.cache/huggingface",
        # Default to an empty token so the env var never becomes the literal string "None".
        "--env", f"HUGGING_FACE_HUB_TOKEN={os.getenv('HF_TOKEN', '')}",
        "--env", "VLLM_ATTENTION_BACKEND=TORCH_SDPA",
        "-p", "8000:8000",
        "--ipc=host",
        "vllm/vllm-openai:latest",
        "--model", settings.MODEL_CHECKPOINT,
        "--no-enforce-eager",
        "--max-num-seqs", "32",
        "--dtype", "bfloat16",
        "--max-model-len", "32768",
        "--max_num_batched_tokens", "65536",
        "--gpu-memory-utilization", ".9",
        "--served-model-name", settings.VLLM_MODEL_NAME,
    ]

    print(f"Starting vLLM server with command: {' '.join(cmd)}")
    try:
        # subprocess.run() blocks and streams the container's output to this terminal.
        subprocess.run(cmd, check=True)
    except KeyboardInterrupt:
        print("\nShutting down vLLM server...")
        sys.exit(0)
    except subprocess.CalledProcessError as e:
        print(f"\nvLLM server exited with error code {e.returncode}")
        sys.exit(e.returncode)


if __name__ == "__main__":
    main()
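
# To launch the server with this wrapper (a guess at the invocation; the package
# may also expose a console-script entry point instead):
#   python -m chandra.scripts.start_vllm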