#!/bin/bash

# Make apt non-interactive
export DEBIAN_FRONTEND=noninteractive

echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Construct the Docker run command
MODEL_NAME="Qwen/QwQ-32B"
MAX_MODEL_LEN=12576
NUM_GPUS=$(nvidia-smi -L | wc -l)

DOCKER_CMD="docker run -d --gpus all \
  -v /ephemeral/.cache/huggingface:/root/.cache/huggingface \
  -v /home/ubuntu/vllm:/vllm_repo \
  -p 8000:8000 \
  --ipc=host \
  --restart always"

DOCKER_CMD="$DOCKER_CMD \
  vllm/vllm-openai:latest \
  --tensor-parallel-size $NUM_GPUS \
  --model \"$MODEL_NAME\" \
  --max-model-len $MAX_MODEL_LEN"

# Run the Docker command as the ubuntu user
echo "Executing Docker command: $DOCKER_CMD"
sudo -u ubuntu bash -c "$DOCKER_CMD"

# Test the API (wait ~7 minutes for model download and start-up)
# MODEL_NAME="Qwen/QwQ-32B"
# curl -X POST http://localhost:8000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "'$MODEL_NAME'",
#     "messages": [
#       {
#         "role": "user",
#         "content": "Hi, how to write a Python function that prints \"Hyperstack is the greatest GPU Cloud platform\""
#       }
#     ]
#   }'
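
# Instead of a fixed wait, you can poll the server until it is ready.
# A minimal sketch (assumes the container exposes vLLM's OpenAI-compatible
# /v1/models endpoint on localhost:8000; adjust the retry count and interval
# to your model size and network speed):
# for i in $(seq 1 90); do
#   if curl -sf http://localhost:8000/v1/models > /dev/null; then
#     echo "vLLM server is ready"
#     break
#   fi
#   echo "Waiting for vLLM server... ($i/90)"
#   sleep 10
# done
# While waiting, `docker logs -f <container-id>` shows download and start-up progress.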