50 changes: 45 additions & 5 deletions examples/disaggregated/slurm/disaggr_torch.slurm
@@ -38,14 +38,42 @@ container_image=${19}
mounts=${20}
workdir=${21}
model_dir=${22}
trtllm_repo=${23}

echo "================= parameters ================="
echo "num_ctx_servers: ${num_ctx_servers}"
echo "ctx_tp_size: ${ctx_tp_size}"
echo "ctx_batch_size: ${ctx_batch_size}"
echo "ctx_max_num_tokens: ${ctx_max_num_tokens}"
echo "ctx_enable_attention_dp: ${ctx_enable_attention_dp}"
echo "num_gen_servers: ${num_gen_servers}"
echo "gen_tp_size: ${gen_tp_size}"
echo "gen_batch_size: ${gen_batch_size}"
echo "gen_max_num_tokens: ${gen_max_num_tokens}"
echo "gen_enable_attention_dp: ${gen_enable_attention_dp}"
echo "gen_gpu_memory_fraction: ${gen_gpu_memory_fraction}"
echo "eplb_num_slots: ${eplb_num_slots}"
echo "mtp_size: ${mtp_size}"
echo "concurrency: ${concurrency}"
echo "isl: ${isl}"
echo "osl: ${osl}"
echo "multi_round: ${multi_round}"
echo "streaming: ${streaming}"
echo "container_image: ${container_image}"
echo "mounts: ${mounts}"
echo "workdir: ${workdir}"
echo "model_dir: ${model_dir}"
echo "trtllm_repo: ${trtllm_repo}"
echo "==========================================="


ctx_max_seq_len=$((isl + 1))
gen_max_seq_len=$((isl + osl))
ctx_gpu_frac=0.75
cache_transceiver_max_num_tokens=8448

container_name=disaggr
logdir=${workdir}/benchmark-${isl}-${osl}/
logdir=${workdir}/benchmark-${isl}-${osl}
mkdir -p ${logdir}
full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}

@@ -65,16 +93,27 @@ fi
mkdir -p ${full_logdir}
echo "Log will be saved to: ${full_logdir}"

if [ -z "${TRT_LLM_GIT_COMMIT}" ]; then
export TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
fi

nsys_on=""
# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling

# start the container
srun -l --container-image=${container_image} \
--container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix \
echo "Container up."

if [ -n "${trtllm_repo}" ]; then
srun --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
bash -c "cd ${trtllm_repo} && echo 'Running install operation...' && pip install -e . " 2>&1 | tee ${full_logdir}/install.log
fi

# generate the yaml file
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
@@ -104,11 +143,12 @@ echo "YAML file generated."
hostname_value=$(grep '^hostname:' ${full_logdir}/config.yaml | awk -F': ' '{print $2}')
echo "server host name: $hostname_value"


# start the workers
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap \
bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &
--mpi=pmix --overlap \
bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &

# start the server
srun -l --container-name=${container_name} \
@@ -121,7 +161,7 @@ srun -l --container-name=${container_name} \
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap -N 1 -n 1 \
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir}/ > ${full_logdir}/benchmark.log 2>&1
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir} > ${full_logdir}/benchmark.log 2>&1

# try to kill the server and workers
srun -l --container-name=${container_name} \
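The additions to this script boil down to one pattern: an optional repo path arrives as the last positional argument, the current commit is recorded for the logs, and an editable install runs only when that path is non-empty. A condensed sketch of the pattern with a hypothetical path (the real script performs the install through srun inside the container):

```bash
#!/bin/bash
# Hypothetical value; the real script reads it from positional argument ${23}.
trtllm_repo=/scratch/TensorRT-LLM   # leave empty ("") to keep the pre-installed build

# Record the source commit for the logs; fall back to "unknown" outside a git checkout.
if [ -z "${TRT_LLM_GIT_COMMIT}" ]; then
    export TRT_LLM_GIT_COMMIT=$(git -C "${trtllm_repo}" rev-parse --short HEAD 2>/dev/null || echo "unknown")
fi

# Editable install only when a repo path was supplied.
if [ -n "${trtllm_repo}" ]; then
    (cd "${trtllm_repo}" && pip install -e .)
fi
```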
52 changes: 26 additions & 26 deletions examples/disaggregated/slurm/run_benchmark.sh
@@ -16,7 +16,7 @@ isl=$1
osl=$2
multi_round=$3
model_name=$4
concurrency=$5
concurrency_list=$5
streaming=$6
log_path=$7

@@ -89,31 +89,31 @@ do_get_logs(){
}

# run the loadgen

mkdir -p ${log_path}/concurrency_${concurrency}
cp ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}/workers_start.log
max_count=$((${concurrency} * ${multi_round}))
echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"

python -m tensorrt_llm.serve.scripts.benchmark_serving \
--model ${model_name} \
--tokenizer ${model_name} \
--dataset-name random \
--dataset-path ${shared_gpt_path} \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-prefix-len 0 \
--num-prompts ${max_count} \
--max-concurrency ${concurrency} \
--host ${hostname} \
--port ${port} \
--ignore-eos \
--no-test-input \
$(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)

do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
# echo "" > ${log_path}/output_workers.log
echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
cp ${log_path}/output_workers.log ${log_path}/workers_start.log
for concurrency in ${concurrency_list}; do
mkdir -p ${log_path}/concurrency_${concurrency}
max_count=$((${concurrency} * ${multi_round}))
echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"
python -m tensorrt_llm.serve.scripts.benchmark_serving \
--model ${model_name} \
--tokenizer ${model_name} \
--dataset-name random \
--dataset-path ${shared_gpt_path} \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-prefix-len 0 \
--num-prompts ${max_count} \
--max-concurrency ${concurrency} \
--host ${hostname} \
--port ${port} \
--ignore-eos \
--no-test-input \
$(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)

do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
echo "" > ${log_path}/output_workers.log
echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
done

echo "Benchmark done, gracefully shutting down server and workers..."
kill -9 $(ps aux | grep '[s]tart_server.sh' | awk '{print $2}') >/dev/null 2>&1 || true
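With the fifth argument now treated as a list, a single call to run_benchmark.sh sweeps several concurrency levels against the already-running server. A hypothetical invocation matching the positional arguments read at the top of the script (all values and paths below are examples only):

```bash
# Arguments: isl osl multi_round model concurrency_list streaming log_path
# (values are hypothetical)
bash run_benchmark.sh 1024 1024 3 /models/DeepSeek-R1 "1 64 1024" true /logs/benchmark-1024-1024
```

The quoted list is word-split by the for-loop, so one benchmark_serving pass runs per concurrency value, and each pass copies its worker logs into concurrency_<N>/ under log_path.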
2 changes: 2 additions & 0 deletions examples/disaggregated/slurm/submit.sh
@@ -7,6 +7,7 @@ container_image=<container_image>
mounts=<mounts> # e.g. /mnt/data:/mnt/data
workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint
repo_dir=<repo_dir> # Path to the TensorRT-LLM repo to install; if empty, the pre-installed version is used

ntasks_per_node=4 # 4 GPUs per GB200 node
total_node_num=8
@@ -31,6 +32,7 @@ args=(
$mounts
$workdir
$model_dir
$repo_dir
)

# This command starts a job with 8 nodes, 32 GPUs in total.
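disaggr_torch.slurm consumes these entries positionally, so repo_dir has to stay at the end of the array. The tail of the mapping, taken from the parameter block at the top of that script, is sketched below.

```bash
# Tail of the positional mapping into disaggr_torch.slurm:
#   ${19} <- $container_image
#   ${20} <- $mounts
#   ${21} <- $workdir
#   ${22} <- $model_dir
#   ${23} <- $repo_dir   # read as trtllm_repo; empty => the editable install is skipped
```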
5 changes: 4 additions & 1 deletion examples/wide_ep/slurm_scripts/submit.sh
@@ -9,6 +9,7 @@ container_image=<container_image>
mounts=<mounts> # e.g. /mnt/data:/mnt/data
workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint
repo_dir=<repo_dir> # Path to the TensorRT-LLM repo to install; if empty, the pre-installed version is used

mtp_size=0
ntasks_per_node=4 # 4 GPUs per GB200 node
@@ -28,7 +29,7 @@ for b in 1 64 1024; do

args=(
${ctx_num} 4 4 4480 true # Context servers arguments
1 16 1024 1024 "0.7" # Generation servers arguments
1 16 1024 1024 true "0.7" # Generation servers arguments
$eplb_num_slots $mtp_size # Other arguments
$concurrency # Benchmarking arguments
$isl
@@ -39,6 +40,7 @@ for b in 1 64 1024; do
$mounts
$workdir
$model_dir
$repo_dir
)

sbatch --nodes=${total_node_num} \
@@ -74,6 +76,7 @@ for b in 512; do
$mounts
$workdir
$model_dir
$repo_dir
)
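
In the first loop above, the added true lands in the generation-server block; judging by the echo section of disaggr_torch.slurm, it corresponds to gen_enable_attention_dp. The assumed expansion is shown below (an inference from positional order, not something this hunk states):

```bash
# Assumed expansion of the generation-server line: 1 16 1024 1024 true "0.7"
num_gen_servers=1
gen_tp_size=16
gen_batch_size=1024
gen_max_num_tokens=1024
gen_enable_attention_dp=true      # value added by this change
gen_gpu_memory_fraction=0.7
```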

sbatch --nodes=${total_node_num} \
Expand Down
Loading