#!/bin/bash

# Make the script non-interactive
export DEBIAN_FRONTEND=noninteractive

echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Define the model name and Hugging Face token
MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
HF_TOKEN="[insert-token-here]"
HF_TOKEN_ARG="--env HF_TOKEN=$HF_TOKEN"

# Get the number of GPUs
NUM_GPUS=$(nvidia-smi -L | wc -l)

# Construct the Docker run command
DOCKER_CMD="docker run -d --gpus all \
    -v /ephemeral/.cache/huggingface:/root/.cache/huggingface \
    -v /home/ubuntu/vllm:/vllm_repo \
    -p 8000:8000 \
    --ipc=host \
    --restart always"

# Pass the Hugging Face token to the container only if one is set
if [ -n "$HF_TOKEN" ]; then
    DOCKER_CMD="$DOCKER_CMD $HF_TOKEN_ARG"
fi

DOCKER_CMD="$DOCKER_CMD \
    vllm/vllm-openai:latest \
    --tensor-parallel-size $NUM_GPUS \
    --model \"$MODEL_NAME\" \
    --max-model-len 23000"

# Run the Docker command as the ubuntu user
echo "Executing Docker command: $DOCKER_CMD"
sudo -u ubuntu bash -c "$DOCKER_CMD"

# Test the API (allow roughly 7 minutes for model download and startup;
# see the readiness-poll sketch at the end of this script)
curl -X POST http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$MODEL_NAME"'",
        "messages": [
            {
                "role": "user",
                "content": "Hello, how are you?"
            }
        ]
    }'
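
# Rather than waiting a fixed ~7 minutes before the test request, you can poll
# the server until it answers. A minimal sketch, assuming the vllm/vllm-openai
# image exposes its health check at GET /health on port 8000; drop this in
# just before the test request above:
echo "Waiting for the vLLM server to come up..."
until curl -sf http://localhost:8000/health > /dev/null; do
    sleep 15
done
echo "vLLM server is ready."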
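
# To extract only the assistant's reply instead of the full JSON body, pipe
# the response through jq (assuming jq is installed on the host; the server
# follows the OpenAI chat-completions response schema):
curl -s -X POST http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$MODEL_NAME"'",
        "messages": [{"role": "user", "content": "Hello, how are you?"}]
    }' | jq -r '.choices[0].message.content'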