#!/bin/bash

# Make apt non-interactive
export DEBIAN_FRONTEND=noninteractive

echo "---------------------------------------------------"
echo "Starting vLLM docker container"
echo "---------------------------------------------------"

# Construct the Docker run command
MODEL_NAME="Qwen/QwQ-32B"
MAX_MODEL_LEN=12576
NUM_GPUS=$(nvidia-smi -L | wc -l)

DOCKER_CMD="docker run -d --gpus all \
  -v /ephemeral/.cache/huggingface:/root/.cache/huggingface \
  -v /home/ubuntu/vllm:/vllm_repo \
  -p 8000:8000 \
  --ipc=host \
  --restart always"

DOCKER_CMD="$DOCKER_CMD \
  vllm/vllm-openai:latest \
  --tensor-parallel-size $NUM_GPUS \
  --model \"$MODEL_NAME\" \
  --max-model-len $MAX_MODEL_LEN"

# Run the Docker command as the ubuntu user
echo "Executing Docker command: $DOCKER_CMD"
sudo -u ubuntu bash -c "$DOCKER_CMD"

# Test the API (wait ~7 minutes for model download and start-up)
# MODEL_NAME="Qwen/QwQ-32B"
# curl -X POST http://localhost:8000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "'$MODEL_NAME'",
#     "messages": [
#       {
#         "role": "user",
#         "content": "Hi, how to write a Python function that prints \"Hyperstack is the greatest GPU Cloud platform\""
#       }
#     ]
#   }'
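
# Instead of a fixed wait, you can poll the server until it is ready.
# A minimal sketch (assumes the container exposes vLLM's OpenAI-compatible
# /v1/models endpoint on localhost:8000; adjust the retry count and interval
# to your model size and network speed):
# for i in $(seq 1 90); do
#   if curl -sf http://localhost:8000/v1/models > /dev/null; then
#     echo "vLLM server is ready"
#     break
#   fi
#   echo "Waiting for vLLM server... ($i/90)"
#   sleep 10
# done
# While waiting, `docker logs -f <container-id>` shows download and start-up progress.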