
Commit 51baf61

feat: automate slurm handling in sglang example.
Signed-off-by: Fadi Saady <[email protected]>
1 parent ee86bad commit 51baf61

File tree

6 files changed: +691 -0 lines changed

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
logs/*
outputs/*
Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# SLURM Jobs for Dynamo Serve Benchmarking

This folder contains SLURM job scripts that launch the Dynamo Serve service on SLURM cluster nodes and monitor GPU activity. Their primary purpose is to automate starting the prefill and decode nodes so that benchmarks can be run against the deployment.

## Overview

The scripts in this folder orchestrate the deployment of Dynamo Serve across multiple cluster nodes, with separate nodes handling prefill and decode operations. Jobs are submitted by a Python script that renders a Jinja2 template, which keeps the configuration flexible.
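The submission logic itself lives in `submit_job_script.py`, which is part of this commit but not reproduced in the excerpt above. As a minimal sketch only, assuming a conventional render-then-`sbatch` flow, the core step could look roughly like the following; the function name `render_and_submit` and the temp-file handling are illustrative, while the keyword arguments mirror the `{{ ... }}` placeholders in `job_script_template.j2` and the CLI flags documented under Usage:

```python
# Hypothetical sketch of the render-and-submit step, not the committed code.
import argparse
import subprocess
import tempfile

from jinja2 import Template


def render_and_submit(args: argparse.Namespace) -> None:
    """Render the SLURM job template and hand the result to sbatch."""
    with open(args.template) as f:
        template = Template(f.read())

    # Total node count is derived from the prefill/decode split.
    total_nodes = args.prefill_nodes + args.decode_nodes

    rendered = template.render(
        job_name=args.job_name,
        account=args.account,
        time_limit=args.time_limit,
        total_nodes=total_nodes,
        prefill_nodes=args.prefill_nodes,
        decode_nodes=args.decode_nodes,
        gpus_per_node=args.gpus_per_node,
        model_dir=args.model_dir,
        config_dir=args.config_dir,
        container_image=args.container_image,
        network_interface=args.network_interface,
    )

    # Write the rendered job script to a temporary file and submit it.
    with tempfile.NamedTemporaryFile("w", suffix=".sh", delete=False) as f:
        f.write(rendered)
        job_script = f.name

    subprocess.run(["sbatch", job_script], check=True)
```

Whatever the real script does internally, the important contract is the set of template variables it must supply; these correspond to the required and optional arguments listed under Usage below.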
## Scripts

- **`submit_job_script.py`**: Main script for generating and submitting SLURM job scripts from templates
- **`job_script_template.j2`**: Jinja2 template for generating SLURM job scripts
- **`scripts/worker_setup.py`**: Worker script that handles the actual Dynamo Serve setup on each node
- **`scripts/monitor_gpu_utilization.sh`**: Script for monitoring GPU utilization during benchmarks
## Logs Folder Structure

Each SLURM job creates a unique log directory under `logs/` using the job ID. For example, job ID `3062824` creates the directory `logs/3062824/`.

### Log File Structure

```
logs/
├── 3062824/                                 # Job ID directory
│   ├── log.out                              # Main job output (node allocation, IP addresses, launch commands)
│   ├── log.err                              # Main job errors
│   ├── eos0197_prefill.out                  # Prefill node stdout (eos0197)
│   ├── eos0197_prefill.err                  # Prefill node stderr (eos0197)
│   ├── eos0200_prefill.out                  # Prefill node stdout (eos0200)
│   ├── eos0200_prefill.err                  # Prefill node stderr (eos0200)
│   ├── eos0201_decode.out                   # Decode node stdout (eos0201)
│   ├── eos0201_decode.err                   # Decode node stderr (eos0201)
│   ├── eos0204_decode.out                   # Decode node stdout (eos0204)
│   ├── eos0204_decode.err                   # Decode node stderr (eos0204)
│   ├── eos0197_prefill_gpu_utilization.log  # GPU utilization monitoring (eos0197)
│   ├── eos0200_prefill_gpu_utilization.log  # GPU utilization monitoring (eos0200)
│   ├── eos0201_decode_gpu_utilization.log   # GPU utilization monitoring (eos0201)
│   └── eos0204_decode_gpu_utilization.log   # GPU utilization monitoring (eos0204)
├── 3063137/                                 # Another job ID directory
├── 3062689/                                 # Another job ID directory
└── ...
```
## Usage

1. **Submit a benchmark job**:

   ```bash
   python submit_job_script.py \
       --template job_script_template.j2 \
       --model-dir /path/to/model \
       --config-dir /path/to/configs \
       --container-image container-image-uri \
       --account your-slurm-account
   ```

   **Required arguments**:
   - `--template`: Path to Jinja2 template file
   - `--model-dir`: Model directory path
   - `--config-dir`: Config directory path
   - `--container-image`: Container image URI (e.g., `registry/repository:tag`)
   - `--account`: SLURM account

   **Optional arguments**:
   - `--prefill-nodes`: Number of prefill nodes (default: `2`)
   - `--decode-nodes`: Number of decode nodes (default: `2`)
   - `--gpus-per-node`: Number of GPUs per node (default: `8`)
   - `--network-interface`: Network interface to use (default: `eth3`)
   - `--job-name`: SLURM job name (default: `dynamo_setup`)
   - `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`)

   **Note**: The script automatically calculates the total number of nodes needed from `--prefill-nodes` and `--decode-nodes`; with the defaults (2 prefill + 2 decode), the job requests 4 nodes.
2. **Monitor job progress**:

   ```bash
   squeue -u $USER
   ```

3. **Check logs in real-time**:

   ```bash
   tail -f logs/{JOB_ID}/log.out
   ```

4. **Monitor GPU utilization**:

   ```bash
   tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log
   ```

## Outputs

Benchmark results and outputs are stored in the `outputs/` directory, which the job script mounts into the container at `/outputs/`.
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name={{ job_name }}
#SBATCH --nodes={{ total_nodes }}
#SBATCH --ntasks={{ total_nodes }}
#SBATCH --ntasks-per-node=1
#SBATCH --account={{ account }}
#SBATCH --time={{ time_limit }}
#SBATCH --output=logs/%j/log.out
#SBATCH --error=logs/%j/log.err

# Constants
PREFILL_NODES={{ prefill_nodes }}
DECODE_NODES={{ decode_nodes }}
TOTAL_NODES=$((PREFILL_NODES + DECODE_NODES))
GPUS_PER_NODE={{ gpus_per_node }}
LOG_DIR="${SLURM_SUBMIT_DIR}/logs/${SLURM_JOB_ID}/"
SCRIPT_DIR="${SLURM_SUBMIT_DIR}/scripts"
OUTPUT_DIR="${SLURM_SUBMIT_DIR}/outputs"
MODEL_DIR="{{ model_dir }}"
CONFIG_DIR="{{ config_dir }}"
CONTAINER_IMAGE="{{ container_image }}"
NETWORK_INTERFACE="{{ network_interface }}"

{% raw %}
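# NOTE: the raw block above keeps Jinja2 from parsing the rest of this script;
# bash syntax such as "${#nodes[@]}" contains "{#", which Jinja2 would otherwise
# treat as the start of a template comment.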

mkdir -p "${OUTPUT_DIR}" "${LOG_DIR}"

nodes=($(scontrol show hostnames $SLURM_NODELIST))
if [ ${#nodes[@]} -ne $TOTAL_NODES ]; then
    echo "Error: Expected $TOTAL_NODES nodes but got ${#nodes[@]} nodes"
    exit 1
fi

# Print node information
for i in "${!nodes[@]}"; do
    echo "Node $i: ${nodes[$i]}"
done

PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+')
if [ -z "$PREFILL_HOST_IP" ]; then
    echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE"
    exit 1
fi
echo "Prefill host IP address: $PREFILL_HOST_IP"

DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+')
if [ -z "$DECODE_HOST_IP" ]; then
    echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE"
    exit 1
fi
echo "Decode host IP address: $DECODE_HOST_IP"

# Prepare enroot arguments to pass to srun commands
ENROOT_ARGS="\
    --container-image=${CONTAINER_IMAGE} \
    --no-container-entrypoint \
    --container-mount-home \
    --no-container-remap-root \
    --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \
"

# Launch prefill tasks on the first PREFILL_NODES nodes
for i in $(seq 0 $((PREFILL_NODES - 1))); do
    node=${nodes[$i]}
    rank=$i
    echo "Launching prefill task on node ${i} (rank ${rank}): $node"
    echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err"
    echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &"
    srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \
        --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \
        python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &
done

# Launch decode tasks on the next DECODE_NODES nodes
for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do
    node=${nodes[$i]}
    rank=$((i - PREFILL_NODES))
    echo "Launching decode task on node ${i} (rank ${rank}): $node"
    echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err"
    echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &"
    srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \
        --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \
        python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &
done

echo ""
echo "To connect to the host prefill node:"
echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bash"

echo ""
echo "Make sure to cancel the job at the end:"
echo "scancel $SLURM_JOB_ID"

# Wait for all tasks to complete
wait
echo "Script finished at $(date)"

{% endraw %}
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#!/bin/bash

# Usage: ./monitor_gpu_utilization.sh [interval_seconds]

# Default interval is 2 seconds
INTERVAL=${1:-2}

# Check if nvidia-smi is available
if ! command -v nvidia-smi &> /dev/null; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') Error: nvidia-smi not found"
    exit 1
fi

echo "Starting GPU utilization monitoring (checking every ${INTERVAL}s, printing only on changes)..."

PREV_UTILIZATION=""
while true; do
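    # Query per-GPU utilization and join nvidia-smi's output lines into one space-separated line.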
    CURRENT_UTILIZATION=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,nounits | paste -sd ' ' -)
    if [ $? -ne 0 ]; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') Error: nvidia-smi command failed"
    else
        if [ "$CURRENT_UTILIZATION" != "$PREV_UTILIZATION" ]; then
            echo "$(date '+%Y-%m-%d %H:%M:%S') GPU Utilization: $CURRENT_UTILIZATION"
            PREV_UTILIZATION="$CURRENT_UTILIZATION"
        fi
    fi

    sleep $INTERVAL
done
