# ------------------------------------------------------------------------------------------------------------------------------
# 1. Install conda
# see: https://docs.anaconda.com/miniconda/miniconda-install/
# ------------------------------------------------------------------------------------------------------------------------------
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda3
export PATH="$HOME/miniconda3/bin:$PATH"
rm Miniconda3-latest-Linux-x86_64.sh
conda init bash
source ~/.bashrc

# ------------------------------------------------------------------------------------------------------------------------------
# 2. Install llama stack
# see: https://github.com/meta-llama/llama-stack?tab=readme-ov-file#installation
# ------------------------------------------------------------------------------------------------------------------------------
HF_TOKEN="[INSERT YOUR HUGGINGFACE TOKEN HERE]"
MODEL_NAME="Llama3.2-11B-Vision-Instruct"
# Uncomment the MODEL_NAME line below to use the 90B model instead.
# Note: the 90B model needs 8 GPUs; llama stack does not support fewer, see also:
# https://github.com/meta-llama/llama-stack/blob/eb2d8a31a5927589197c794855d7323f8f4700bc/llama_stack/providers/impls/meta_reference/inference/config.py#L40
# MODEL_NAME="Llama3.2-90B-Vision-Instruct"
BASE_NAME="llama3-2"
MAX_SEQ_LEN=4096
MAX_BATCH_SIZE=1

mkdir -p /home/ubuntu/local
cd /home/ubuntu/local
git clone https://github.com/meta-llama/llama-stack.git
conda create -n stack python=3.10 -y
conda activate stack
cd llama-stack
$CONDA_PREFIX/bin/pip install -e .

# Download the model to ephemeral storage so we can store more models
# Note: after restarting the instance from hibernation, we need to re-download the model
export LLAMA_STACK_CONFIG_DIR=/ephemeral/.llama
sudo chown -R ubuntu:ubuntu /ephemeral
llama download --source huggingface --model-id $MODEL_NAME --hf-token $HF_TOKEN

# Build stack
# llama stack build --template local --name $BASE_NAME
# Using pty to simulate a terminal (conda lives under /home/ubuntu/miniconda3, installed in step 1;
# once the env is activated, llama resolves from the env's bin directory)
sudo python3 -c "
import pty; pty.spawn(['/bin/bash', '-c', 'source /home/ubuntu/miniconda3/etc/profile.d/conda.sh && \
conda activate /home/ubuntu/miniconda3/envs/stack && \
export LLAMA_STACK_CONFIG_DIR=/ephemeral/.llama && \
llama stack build --template local --name $BASE_NAME'])"

# Configure (interactively)
# llama stack configure "$BASE_NAME"

# Configure (non-interactively)
CONFIG_YML_PATH=/ephemeral/.llama/builds/conda/$BASE_NAME-run.yaml
mkdir -p $(dirname $CONFIG_YML_PATH)
cat <<EOF > $CONFIG_YML_PATH
built_at: '2024-09-30T10:40:07.669022'
image_name: $BASE_NAME
docker_image: null
conda_env: $BASE_NAME
apis_to_serve:
- shields
- safety
- agents
- memory_banks
- models
- inference
- memory
api_providers:
  inference:
    providers:
    - meta-reference
  memory:
    providers:
    - meta-reference
  safety:
    providers:
    - meta-reference
  agents:
    provider_id: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: /home/ubuntu/.llama/runtime/kvstore.db
  telemetry:
    provider_id: meta-reference
    config: {}
routing_table:
  inference:
  - provider_id: meta-reference
    config:
      model: $MODEL_NAME
      quantization: null
      torch_seed: 1
      max_seq_len: $MAX_SEQ_LEN
      max_batch_size: $MAX_BATCH_SIZE
    routing_key: $MODEL_NAME
  memory:
  - provider_id: meta-reference
    config: {}
    routing_key: vector
  safety:
  - provider_id: meta-reference
    config:
      llama_guard_shield: null
      prompt_guard_shield: null
    routing_key: llama_guard
  - provider_id: meta-reference
    config:
      llama_guard_shield: null
      prompt_guard_shield: null
    routing_key: code_scanner_guard
  - provider_id: meta-reference
    config:
      llama_guard_shield: null
      prompt_guard_shield: null
    routing_key: injection_shield
  - provider_id: meta-reference
    config:
      llama_guard_shield: null
      prompt_guard_shield: null
    routing_key: jailbreak_shield
EOF

# Make sure the ubuntu user has access to all directories
sudo chown -R ubuntu:ubuntu /home/ubuntu

# Run llama stack (this blocks in the foreground; run the API calls below from a second shell)
echo "Running llama stack run for model $MODEL_NAME"
# Using pty to simulate a terminal
sudo python3 -c "
import pty; pty.spawn(['/bin/bash', '-c', 'source /home/ubuntu/miniconda3/etc/profile.d/conda.sh && \
conda activate /home/ubuntu/miniconda3/envs/stack && \
export LLAMA_STACK_CONFIG_DIR=/ephemeral/.llama && \
llama stack run $BASE_NAME --port 8000'])"
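# Optional: before calling the API, wait until the server accepts connections.
# A minimal sketch assuming the port 8000 configured above; this is a plain TCP
# probe using bash's built-in /dev/tcp, not a llama-stack-specific health endpoint.
until timeout 1 bash -c 'cat < /dev/null > /dev/tcp/localhost/8000' 2>/dev/null; do
  echo "Waiting for the llama stack server on port 8000..."
  sleep 5
done
echo "Server is accepting connections"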
# Try out the API (image)
# Download a test image
IMAGE_URL="https://www.hyperstack.cloud/hs-fs/hubfs/deploy-vm-11-ecd8c53003182041d3a2881d0010f6c6-1.png?width=3352&height=1852&name=deploy-vm-11-ecd8c53003182041d3a2881d0010f6c6-1.png"
IMAGE_EXTENSION=$(echo "$IMAGE_URL" | awk -F. '{print $NF}' | cut -d'?' -f1)
FILE_NAME="/home/ubuntu/downloaded_image.$IMAGE_EXTENSION"
curl -o "$FILE_NAME" "$IMAGE_URL"

# Write the JSON payload to the payload.json file
cat <<EOF > payload.json
{
  "model": "$MODEL_NAME",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "image": {
            "uri": "file://$FILE_NAME"
          }
        },
        "Describe this image in two sentences"
      ]
    }
  ]
}
EOF

# Use the JSON payload file in the curl command
curl -X POST http://localhost:8000/inference/chat_completion \
  -H "Content-Type: application/json" \
  -d @payload.json

# Try out the API (text)
# curl -X POST http://localhost:8000/inference/chat_completion \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "Llama3.2-11B-Vision-Instruct",
#     "messages": [
#       {
#         "role": "user",
#         "content": "Hello, how are you?"
#       }
#     ]
#   }'
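# Optional: pretty-print the response for easier reading. A minimal sketch that
# assumes the endpoint returns a single JSON body (i.e. a non-streaming response);
# python3 -m json.tool is part of the standard library, so no extra dependencies.
# curl -s -X POST http://localhost:8000/inference/chat_completion \
#   -H "Content-Type: application/json" \
#   -d @payload.json | python3 -m json.tool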