#!/bin/bash
#
# Start the vLLM OpenAI-compatible server in a Docker container (run as the
# "ubuntu" user), wait for the API to come up, then send one chat-completion
# request as a smoke test.
#
# Optional environment overrides:
#   HF_TOKEN              Hugging Face access token. When unset, the
#                         placeholder below is used (edit it in place). When
#                         set to the empty string, no token is passed to the
#                         container.
#   STARTUP_TIMEOUT_SECS  Maximum time to wait for the API before giving up
#                         (default: 1800).

set -euo pipefail

# Make non interactive
export DEBIAN_FRONTEND=noninteractive

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Define the model name and Hugging Face token
readonly MODEL_NAME="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
readonly HF_TOKEN_PLACEHOLDER="[insert-hf-token]"
# "-" (not ":-") so an explicitly empty HF_TOKEN means "pass no token".
HF_TOKEN="${HF_TOKEN-$HF_TOKEN_PLACEHOLDER}"
readonly HF_TOKEN

readonly API_URL="http://localhost:8000"
readonly STARTUP_TIMEOUT_SECS="${STARTUP_TIMEOUT_SECS:-1800}"
readonly POLL_INTERVAL_SECS=15

[[ "$STARTUP_TIMEOUT_SECS" =~ ^[0-9]+$ ]] \
  || die "STARTUP_TIMEOUT_SECS must be a non-negative integer"

if [[ "$HF_TOKEN" == "$HF_TOKEN_PLACEHOLDER" ]]; then
  printf 'warning: HF_TOKEN is still the placeholder value\n' >&2
fi

# Get the number of GPUs (one line of `nvidia-smi -L` output per device)
command -v nvidia-smi >/dev/null || die "nvidia-smi not found in PATH"
NUM_GPUS=$(nvidia-smi -L | wc -l)
(( NUM_GPUS > 0 )) || die "nvidia-smi reported no GPUs"

# Construct the Docker run command as an argv array so that no argument is
# ever re-parsed by a shell.
docker_cmd=(
  docker run -d --gpus all
  -v /ephemeral/.cache/huggingface:/root/.cache/huggingface
  -v /home/ubuntu/vllm:/vllm_repo
  -p 8000:8000
  --ipc=host
  --restart always
)

# Pass the Hugging Face token only if one is configured
if [[ -n "$HF_TOKEN" ]]; then
  docker_cmd+=(--env "HF_TOKEN=$HF_TOKEN")
fi

docker_cmd+=(
  vllm/vllm-openai:latest
  --tensor-parallel-size "$NUM_GPUS"
  --model "$MODEL_NAME"
  --max_model_len 10000
)

# Log the command with the token redacted so the secret does not end up in
# console output or provisioning logs.
log_line="Executing Docker command:"
for arg in "${docker_cmd[@]}"; do
  if [[ "$arg" == HF_TOKEN=* ]]; then
    arg="HF_TOKEN=<redacted>"
  fi
  printf -v quoted_arg '%q' "$arg"
  log_line+=" $quoted_arg"
done
echo "$log_line"

# Run the Docker command as ubuntu user
sudo -u ubuntu "${docker_cmd[@]}" || die "docker run failed"

# The container starts detached and the server only answers once the model
# has been downloaded and loaded, so poll the health endpoint before testing.
echo "Waiting up to ${STARTUP_TIMEOUT_SECS}s for the API at ${API_URL} ..."
deadline=$(( SECONDS + STARTUP_TIMEOUT_SECS ))
until curl --silent --fail --max-time 5 --output /dev/null "${API_URL}/health"; do
  (( SECONDS < deadline )) \
    || die "API did not become ready within ${STARTUP_TIMEOUT_SECS}s"
  sleep "$POLL_INTERVAL_SECS"
done

# Test the API
payload=$(printf '{
  "model": "%s",
  "messages": [
    {
      "role": "user",
      "content": "Hello, how are you?"
    }
  ]
}' "$MODEL_NAME")

curl -X POST "${API_URL}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "$payload"