import os
import subprocess
import sys

from chandra.settings import settings

# VLLM_ATTENTION_BACKEND can also be set to FLASH_ATTN if flash-attn is available in the image.
"""
|
|
sudo docker run --runtime nvidia --gpus all \
|
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
|
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
|
--env "VLLM_ATTENTION_BACKEND=TORCH_SDPA" \
|
|
-p 8000:8000 \
|
|
--ipc=host \
|
|
vllm/vllm-openai:latest \
|
|
--model datalab-to/chandra-0.2.4 \
|
|
--no-enforce-eager \
|
|
--max-num-seqs 32 \
|
|
--dtype bfloat16 \
|
|
--max-model-len 32768 \
|
|
--max_num_batched_tokens 65536 \
|
|
--gpu-memory-utilization .9 \
|
|
--served-model-name chandra
|
|
"""


def main():
    # Mirrors the reference command in the module docstring, with the GPU set,
    # model checkpoint, and served model name taken from settings.
    cmd = [
        "sudo",
        "docker",
        "run",
        "--runtime", "nvidia",
        "--gpus", f"device={settings.VLLM_GPUS}",
        "-v", f"{os.path.expanduser('~')}/.cache/huggingface:/root/.cache/huggingface",
        "--env", f"HUGGING_FACE_HUB_TOKEN={os.getenv('HF_TOKEN')}",
        "--env", "VLLM_ATTENTION_BACKEND=TORCH_SDPA",
        "-p", "8000:8000",
        "--ipc=host",  # PyTorch workers share memory via IPC inside the container
        "vllm/vllm-openai:latest",
        "--model", settings.MODEL_CHECKPOINT,
        "--no-enforce-eager",
        "--max-num-seqs", "32",
        "--dtype", "bfloat16",
        "--max-model-len", "32768",
        "--max-num-batched-tokens", "65536",
        "--gpu-memory-utilization", "0.9",
        "--served-model-name", settings.VLLM_MODEL_NAME,
    ]

    print(f"Starting vLLM server with command: {' '.join(cmd)}")

    try:
        # subprocess.run() blocks until the container exits and streams its output.
        subprocess.run(cmd, check=True)
    except KeyboardInterrupt:
        print("\nShutting down vLLM server...")
        sys.exit(0)
    except subprocess.CalledProcessError as e:
        print(f"\nvLLM server exited with error code {e.returncode}")
        sys.exit(e.returncode)


if __name__ == "__main__":
    main()