
Commit f8730d5

Merge branch 'main' into deep_ep/fp4_combine
2 parents 118f9e7 + 50e5e72 commit f8730d5

File tree

24 files changed: +997 -209 lines changed


cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -297,6 +297,11 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams)
         = mFixedParams.isSPadded ? runnerParams.b * runnerParams.qSeqLen : runnerParams.totalQSeqLen;
     mLaunchParams.total_kv_seqlen
         = mFixedParams.isSPadded ? runnerParams.b * runnerParams.kvSeqLen : runnerParams.totalKvSeqLen;
+    // Workaround for nvbug 5412456: total_kv_seqlen fallbacks to total_q_seqlen if it's zero.
+    if (mLaunchParams.total_kv_seqlen == 0)
+    {
+        mLaunchParams.total_kv_seqlen = mLaunchParams.total_q_seqlen;
+    }
 
     TLLM_CHECK_WITH_INFO(mFixedParams.headSize > 0, "Head size should be greater than 0.");
     // Pad head size to next power of 2.
```

docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md

Lines changed: 77 additions & 1 deletion
````diff
@@ -3,11 +3,12 @@
 TensorRT-LLM provides the OpenAI-compatiable API via `trtllm-serve` command.
 A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).
 
-This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B:
+This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models:
 * Methodology Introduction
 * Launch the OpenAI-Compatibale Server with NGC container
 * Run the performance benchmark
 * Using `extra_llm_api_options`
+* Multimodal Serving and Benchmarking
 
 
 ## Methodology Introduction
@@ -220,3 +221,78 @@ The following is a list of common performance switches.
  **Default**: TRTLLM
 
 See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.`
+
+## Multimodal Serving and Benchmarking
+
+TensorRT-LLM supports multimodal models for both serving and benchmarking. This section covers how to set up multimodal serving and run benchmarks for multimodal models.
+
+### Setting up Multimodal Serving
+
+Here's an example of setting up multimodal serving with Qwen2.5-VL:
+
+```bash
+#!/bin/bash
+model_path=/path/to/qwen2.5vl-7B_model
+
+trtllm-serve ${model_path} \
+    --max_batch_size 64 \
+    --max_num_tokens 8192 \
+    --max_seq_len 4096 \
+    --kv_cache_free_gpu_memory_fraction 0.9 \
+    --tp_size 1 \
+    --ep_size 1 \
+    --trust_remote_code
+```
+
+### Multimodal Benchmarking
+
+For multimodal serving benchmarks, you can use the `benchmark_serving.py` script with multimodal datasets:
+
+```bash
+python -m tensorrt_llm.serve.scripts.benchmark_serving \
+    --model ${model_path} \
+    --backend openai-chat \
+    --dataset-name "random_image" \
+    --random-input-len 128 \
+    --random-output-len 128 \
+    --random-image-width 512 \
+    --random-image-height 512 \
+    --random-num-images 1 \
+    --num-prompts 100 \
+    --max-concurrency 8 \
+    --ignore-eos
+```
+
+Below is some example TensorRT-LLM serving benchmark output. Your actual results may vary.
+```
+============ Serving Benchmark Result ============
+Successful requests:                     1
+Benchmark duration (s):                  0.83
+Total input tokens:                      128
+Total generated tokens:                  128
+Request throughput (req/s):              1.20
+Output token throughput (tok/s):         153.92
+Total Token throughput (tok/s):          307.85
+User throughput (tok/s):                 154.15
+Mean Request AR:                         0.9845
+Median Request AR:                       0.9845
+---------------Time to First Token----------------
+Mean TTFT (ms):                          84.03
+Median TTFT (ms):                        84.03
+P99 TTFT (ms):                           84.03
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          5.88
+Median TPOT (ms):                        5.88
+P99 TPOT (ms):                           5.88
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           5.83
+Median ITL (ms):                         5.88
+P99 ITL (ms):                            6.14
+==================================================
+```
+
+**Notes for Multimodal Benchmarking:**
+- Set `--backend` as `openai-chat` since multimodal models are only supported on the chat API and require a chat template
+- Control the number of images per request with `--random-num-images`
+- Use `--random-image-width` and `--random-image-height` to specify image dimensions or `--random-image-size` for squared image dimensions.
+- The `random_image` dataset generates synthetic images for benchmarking
````
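
For readers following the new multimodal section, here is a minimal sketch of the kind of chat request the `openai-chat` backend exercises against the OpenAI-compatible endpoint exposed by `trtllm-serve`. The host, port, model path, and image URL below are placeholder assumptions, not values taken from this commit.

```python
# Hypothetical client-side sketch: one multimodal chat-completions request against
# a locally running trtllm-serve instance. Host/port, model path, and image URL
# are placeholders; adjust them to your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

response = client.chat.completions.create(
    model="/path/to/qwen2.5vl-7B_model",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in one sentence."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/sample.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```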

examples/disaggregated/slurm/disaggr_torch.slurm

Lines changed: 45 additions & 5 deletions
```diff
@@ -38,14 +38,42 @@ container_image=${19}
 mounts=${20}
 workdir=${21}
 model_dir=${22}
+trtllm_repo=${23}
+
+echo "================= parameters ================="
+echo "num_ctx_servers: ${num_ctx_servers}"
+echo "ctx_tp_size: ${ctx_tp_size}"
+echo "ctx_batch_size: ${ctx_batch_size}"
+echo "ctx_max_num_tokens: ${ctx_max_num_tokens}"
+echo "ctx_enable_attention_dp: ${ctx_enable_attention_dp}"
+echo "num_gen_servers: ${num_gen_servers}"
+echo "gen_tp_size: ${gen_tp_size}"
+echo "gen_batch_size: ${gen_batch_size}"
+echo "gen_max_num_tokens: ${gen_max_num_tokens}"
+echo "gen_enable_attention_dp: ${gen_enable_attention_dp}"
+echo "gen_gpu_memory_fraction: ${gen_gpu_memory_fraction}"
+echo "eplb_num_slots: ${eplb_num_slots}"
+echo "mtp_size: ${mtp_size}"
+echo "concurrency: ${concurrency}"
+echo "isl: ${isl}"
+echo "osl: ${osl}"
+echo "multi_round: ${multi_round}"
+echo "streaming: ${streaming}"
+echo "container_image: ${container_image}"
+echo "mounts: ${mounts}"
+echo "workdir: ${workdir}"
+echo "model_dir: ${model_dir}"
+echo "trtllm_repo: ${trtllm_repo}"
+echo "==========================================="
+
 
 ctx_max_seq_len=$((isl + 1))
 gen_max_seq_len=$((isl + osl))
 ctx_gpu_frac=0.75
 cache_transceiver_max_num_tokens=8448
 
 container_name=disaggr
-logdir=${workdir}/benchmark-${isl}-${osl}/
+logdir=${workdir}/benchmark-${isl}-${osl}
 mkdir -p ${logdir}
 full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 
@@ -65,16 +93,27 @@ fi
 mkdir -p ${full_logdir}
 echo "Log will be saved to: ${full_logdir}"
 
+if [ -z "${TRT_LLM_GIT_COMMIT}" ]; then
+    export TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
+    echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
+fi
+
 nsys_on=""
 # nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
-
 # start the container
 srun -l --container-image=${container_image} \
     --container-name=${container_name} \
     --container-mounts=${mounts} \
     --mpi=pmix \
     echo "Container up."
 
+if [ -n "${trtllm_repo}" ]; then
+    srun --container-name=${container_name} \
+        --container-mounts=${mounts} \
+        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
+        bash -c "cd ${trtllm_repo} && echo 'Running install operation...' && pip install -e . " 2>&1 | tee ${full_logdir}/install.log
+fi
+
 # generate the yaml file
 srun -l --container-name=${container_name} \
     --container-mounts=${mounts} \
@@ -104,11 +143,12 @@ echo "YAML file generated."
 hostname_value=$(grep '^hostname:' ${full_logdir}/config.yaml | awk -F': ' '{print $2}')
 echo "server host name: $hostname_value"
 
+
 # start the workers
 srun -l --container-name=${container_name} \
     --container-mounts=${mounts} \
-        --mpi=pmix --overlap \
-        bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &
+    --mpi=pmix --overlap \
+    bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &
 
 # start the server
 srun -l --container-name=${container_name} \
@@ -121,7 +161,7 @@ srun -l --container-name=${container_name} \
 srun -l --container-name=${container_name} \
     --container-mounts=${mounts} \
     --mpi=pmix --overlap -N 1 -n 1 \
-    bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir}/ > ${full_logdir}/benchmark.log 2>&1
+    bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir} > ${full_logdir}/benchmark.log 2>&1
 
 # try to kill the server and workers
 srun -l --container-name=${container_name} \
```

examples/disaggregated/slurm/run_benchmark.sh

Lines changed: 26 additions & 26 deletions
```diff
@@ -16,7 +16,7 @@ isl=$1
 osl=$2
 multi_round=$3
 model_name=$4
-concurrency=$5
+concurrency_list=$5
 streaming=$6
 log_path=$7
 
@@ -89,31 +89,31 @@ do_get_logs(){
 }
 
 # run the loadgen
-
-mkdir -p ${log_path}/concurrency_${concurrency}
-cp ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}/workers_start.log
-max_count=$((${concurrency} * ${multi_round}))
-echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"
-
-python -m tensorrt_llm.serve.scripts.benchmark_serving \
-    --model ${model_name} \
-    --tokenizer ${model_name} \
-    --dataset-name random \
-    --dataset-path ${shared_gpt_path} \
-    --random-input-len ${isl} \
-    --random-output-len ${osl} \
-    --random-prefix-len 0 \
-    --num-prompts ${max_count} \
-    --max-concurrency ${concurrency} \
-    --host ${hostname} \
-    --port ${port} \
-    --ignore-eos \
-    --no-test-input \
-    $(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)
-
-do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
-# echo "" > ${log_path}/output_workers.log
-echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
+cp ${log_path}/output_workers.log ${log_path}/workers_start.log
+for concurrency in ${concurrency_list}; do
+    mkdir -p ${log_path}/concurrency_${concurrency}
+    max_count=$((${concurrency} * ${multi_round}))
+    echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"
+    python -m tensorrt_llm.serve.scripts.benchmark_serving \
+        --model ${model_name} \
+        --tokenizer ${model_name} \
+        --dataset-name random \
+        --dataset-path ${shared_gpt_path} \
+        --random-input-len ${isl} \
+        --random-output-len ${osl} \
+        --random-prefix-len 0 \
+        --num-prompts ${max_count} \
+        --max-concurrency ${concurrency} \
+        --host ${hostname} \
+        --port ${port} \
+        --ignore-eos \
+        --no-test-input \
+        $(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)
+
+    do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
+    echo "" > ${log_path}/output_workers.log
+    echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
+done
 
 echo "Benchmark done, gracefully shutting down server and workers..."
 kill -9 $(ps aux | grep '[s]tart_server.sh' | awk '{print $2}') >/dev/null 2>&1 || true
```

examples/disaggregated/slurm/submit.sh

Lines changed: 2 additions & 0 deletions
```diff
@@ -7,6 +7,7 @@ container_image=<container_image>
 mounts=<mounts> # e.g. /mnt/data:/mnt/data
 workdir=<workdir> # Path to disaggr_torch.slurm
 model_dir=<model_dir> # Path to the model checkpoint
+repo_dir=<repo_dir> # Path to the repo to install TensorRT-LLM, if this is empty, the pre-installed version will be used
 
 ntasks_per_node=4 # 4 GPUs per GB200 node
 total_node_num=8
@@ -31,6 +32,7 @@ args=(
     $mounts
     $workdir
     $model_dir
+    $repo_dir
 )
 
 # This command starts a job with 8 nodes, 32 GPUs in total.
```

examples/quantization/quantize_mixed_precision_moe.py

Lines changed: 25 additions & 13 deletions
```diff
@@ -45,10 +45,16 @@ def load_and_preprocess_state_dict(modelopt_state_root, world_size=8):
     state_dict_list = []
     # load amax from state dict
     for rank in range(world_size):
-        state_dict_list.append(
-            torch.load(
-                f"{modelopt_state_root}/amax_dict_rank{rank}-mp{world_size}.pt",
-                map_location="cuda:0"))
+        amax_file = f"{modelopt_state_root}/amax_dict_rank{rank}-mp{world_size}.pt"
+        if os.path.exists(amax_file):
+            state_dict_list.append(torch.load(amax_file, map_location="cuda:0"))
+        else:
+            print(f"WARNING: amax file not found: {amax_file}")
+
+    if not state_dict_list:
+        print("ERROR: No amax files loaded!")
+        return {}
+
     # calculate the max across all TP ranks
     merged_state_dict = state_dict_list[0]
     for rank in range(world_size):
@@ -232,15 +238,18 @@ def get_file_name(layer):
             continue
         new_safetensors.update({key: get_tensor(key)})
 
+    # Process activation scales for all ranks
+    if os.path.isdir(args.act_scales):
+        # Extract activation scales
+        renamed_state_dict = load_and_preprocess_state_dict(
+            modelopt_state_root=args.act_scales, world_size=8)
+        scales = get_scales_from_amax(start_layer=start_layer,
+                                      end_layer=end_layer,
+                                      renamed_state_dict=renamed_state_dict)
+        new_safetensors.update(scales)
+
     if args.rank == 0:
-        if os.path.isdir(args.act_scales):
-            # Extract activation scales
-            renamed_state_dict = load_and_preprocess_state_dict(
-                modelopt_state_root=args.act_scales, world_size=8)
-            get_scales_from_amax(start_layer=start_layer,
-                                 end_layer=end_layer,
-                                 renamed_state_dict=renamed_state_dict)
-        else:
+        if not os.path.isdir(args.act_scales):
             input_scales = safe_open(args.act_scales, "pt")
             for k in input_scales.keys():
                 new_safetensors.update({k: input_scales.get_tensor(k)})
@@ -259,7 +268,10 @@ def get_file_name(layer):
         ]
         for name in names:
            shutil.copy(os.path.join(model_dir, name), output_dir)
-        shutil.copy(args.act_scales, output_dir)
+        if os.path.isdir(args.act_scales):
+            shutil.copytree(args.act_scales, output_dir, dirs_exist_ok=True)
+        else:
+            shutil.copy(args.act_scales, output_dir)
 
     # config.json
     del config['quantization_config']
```
examples/wide_ep/slurm_scripts/submit.sh

Lines changed: 4 additions & 1 deletion
```diff
@@ -9,6 +9,7 @@ container_image=<container_image>
 mounts=<mounts> # e.g. /mnt/data:/mnt/data
 workdir=<workdir> # Path to disaggr_torch.slurm
 model_dir=<model_dir> # Path to the model checkpoint
+repo_dir=<repo_dir> # Path to the repo to install TensorRT-LLM, if this is empty, the pre-installed version will be used
 
 mtp_size=0
 ntasks_per_node=4 # 4 GPUs per GB200 node
@@ -28,7 +29,7 @@ for b in 1 64 1024; do
 
     args=(
         ${ctx_num} 4 4 4480 true # Context servers arguments
-        1 16 1024 1024 "0.7" # Generation servers arguments
+        1 16 1024 1024 true "0.7" # Generation servers arguments
         $eplb_num_slots $mtp_size # Other arguments
         $concurrency # Benchmarking arguments
         $isl
@@ -39,6 +40,7 @@ for b in 1 64 1024; do
         $mounts
         $workdir
         $model_dir
+        $repo_dir
     )
 
     sbatch --nodes=${total_node_num} \
@@ -74,6 +76,7 @@ for b in 512; do
         $mounts
         $workdir
         $model_dir
+        $repo_dir
     )
 
     sbatch --nodes=${total_node_num} \
```

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -110,13 +110,11 @@ def __init__(
         assert len(
             self.initial_local_expert_ids) == self.expert_size_per_partition
 
-        max_num_tokens = model_config.max_num_tokens
         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        if self.use_dp:
-            max_num_tokens *= model_config.mapping.world_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
         # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
-        if self.moe_max_num_tokens < max_num_tokens:
+        if self.moe_max_num_tokens < moe_max_num_tokens:
             self.aux_stream = aux_stream_dict[
                 AuxStreamType.
                 MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
```
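
The change above replaces the `use_dp`-conditional scaling with an unconditional multiply by the DP size. Below is a small standalone sketch of the resulting default token budget and of when chunking kicks in; the function name and the example numbers are illustrative, not part of the module.

```python
from typing import Optional


def resolve_moe_max_num_tokens(max_num_tokens: int,
                               dp_size: int,
                               moe_max_num_tokens: Optional[int] = None):
    """Illustrative sketch of the budget computed in the diff above."""
    # With attention DP, each DP rank can contribute up to max_num_tokens,
    # so the MoE layer may see max_num_tokens * dp_size tokens per iteration.
    default_budget = max_num_tokens * dp_size
    resolved = moe_max_num_tokens or default_budget
    # Chunking (and the auxiliary CUDA stream) is only required when the
    # configured budget is smaller than the worst-case token count.
    needs_chunking = resolved < default_budget
    return resolved, needs_chunking


print(resolve_moe_max_num_tokens(8192, 4))         # (32768, False)
print(resolve_moe_max_num_tokens(8192, 4, 16384))  # (16384, True)
```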

0 commit comments
