@@ -196,7 +196,7 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 896
- 512
@@ -263,7 +263,7 @@ YOUR_DATA_PATH=./dataset.txt

cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -157,7 +157,7 @@ These optimizations target the overall execution flow, scheduling, and resource

There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of captured CUDA Graphs and the CUDA Graph hit ratio: it pads a batch up to the nearest batch size that has a captured CUDA Graph. Normally you should enable CUDA Graph padding to increase the hit rate, but the padding itself has some overhead due to wasted token computation.

Users can opt out of the CUDA Graph padding feature to see the perf benefit by setting `cuda_graph_config:\n padding_enabled: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
Users can opt out of the CUDA Graph padding feature to see the perf benefit by setting `cuda_graph_config:\n enable_padding: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
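As a rough sketch of what the padding feature does (and what opting out skips), the snippet below rounds a batch up to the nearest captured CUDA Graph size; the helper name and the captured sizes are invented for illustration and are not part of this change.

```python
import bisect

def pad_to_captured_size(batch_size: int, captured_sizes: list[int]) -> int:
    """Conceptual sketch: round a batch up to the nearest captured CUDA Graph size.

    Returns the original batch size when no captured graph is large enough,
    i.e. the batch falls back to eager execution.
    """
    sizes = sorted(captured_sizes)
    idx = bisect.bisect_left(sizes, batch_size)
    return sizes[idx] if idx < len(sizes) else batch_size

# With enable_padding: true and captured sizes [1, 2, 4, 8, 16], a batch of 5
# is padded (3 wasted sequences) so it can reuse the graph captured for 8.
print(pad_to_captured_size(5, [1, 2, 4, 8, 16]))  # -> 8
# With enable_padding: false, the same batch of 5 would miss the captured graphs.
```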

* Overlap Scheduler:

2 changes: 1 addition & 1 deletion docs/source/performance/perf-overview.md
@@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
`llm_options.yml`
```yaml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
2 changes: 1 addition & 1 deletion docs/source/scripts/disaggregated/gen_yaml.py
@@ -190,7 +190,7 @@ def gen_config_file(config_path: str,
'max_seq_len': 8576,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
2 changes: 1 addition & 1 deletion examples/llm-api/llm_runtime.py
@@ -21,7 +21,7 @@ def example_cuda_graph_config():

cuda_graph_config = CudaGraphConfig(
batch_sizes=[1, 2, 4],
padding_enabled=True,
enable_padding=True,
)

llm = LLM(
2 changes: 1 addition & 1 deletion examples/llm-api/quickstart_advanced.py
@@ -188,7 +188,7 @@ def setup_llm(args):

cuda_graph_config = CudaGraphConfig(
batch_sizes=args.cuda_graph_batch_sizes,
padding_enabled=args.cuda_graph_padding_enabled,
enable_padding=args.cuda_graph_padding_enabled,
) if args.use_cuda_graph else None
llm = LLM(
model=args.model_dir,
10 changes: 5 additions & 5 deletions examples/models/core/deepseek_v3/README.md
@@ -142,7 +142,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \

cat <<EOF > /tmp/extra-llm-api-config.yml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes: [1, 4, 8, 12]
EOF

@@ -169,7 +169,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \

cat <<EOF > /tmp/extra-llm-api-config.yml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes: [1, 2]
moe_max_num_tokens: 16384
EOF
@@ -237,7 +237,7 @@ To serve the model using `trtllm-serve`:
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -316,7 +316,7 @@ export TRTLLM_USE_UCX_KVCACHE=1

cat >./gen-extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -539,7 +539,7 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \

cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
8 changes: 4 additions & 4 deletions examples/models/core/llama4/README.md
@@ -29,15 +29,15 @@ enable_attention_dp: true
stream_interval: 2
cuda_graph_config:
max_batch_size: 512
padding_enabled: true
enable_padding: true
EOF
```
Explanation:
- `enable_attention_dp`: Enable attention Data Parallel, which is recommended under high concurrency.
- `stream_interval`: The iteration interval to create responses under the streaming mode.
- `cuda_graph_config`: CUDA Graph config.
- `max_batch_size`: Max CUDA graph batch size to capture.
- `padding_enabled`: Whether to enable CUDA graph padding.
- `enable_padding`: Whether to enable CUDA graph padding.


#### 2. Launch trtllm-serve OpenAI-compatible API server
@@ -81,7 +81,7 @@ enable_min_latency: true
stream_interval: 2
cuda_graph_config:
max_batch_size: 8
padding_enabled: true
enable_padding: true
EOF
```
Explanation:
@@ -90,7 +90,7 @@ Explanation:
- `stream_interval`: The iteration interval to create responses under the streaming mode.
- `cuda_graph_config`: CUDA Graph config.
- `max_batch_size`: Max CUDA graph batch size to capture.
- `padding_enabled`: Whether to enable CUDA graph padding.
- `enable_padding`: Whether to enable CUDA graph padding.


#### 2. Launch trtllm-serve OpenAI-compatible API server
4 changes: 2 additions & 2 deletions examples/models/core/qwen/README.md
@@ -745,7 +745,7 @@ To serve the model using `trtllm-serve`:
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -821,7 +821,7 @@ export TRTLLM_USE_UCX_KVCACHE=1

cat >./gen-extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
2 changes: 1 addition & 1 deletion examples/wide_ep/slurm_scripts/gen_yaml.py
@@ -196,7 +196,7 @@ def gen_config_file(config_path: str,
'max_seq_len': 2176,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
6 changes: 3 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -309,7 +309,7 @@ def get_rank_model_storage(model):
def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
max_batch_size: int, max_num_tokens: int,
max_draft_len: int,
padding_enabled: bool) -> list[int]:
enable_padding: bool) -> list[int]:
# This is the largest possible batch size for a pure decoding batch.
max_cuda_graph_bs = min(max_batch_size,
int(max_num_tokens / (1 + max_draft_len)))
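For concreteness, a small worked example of the formula above, with purely illustrative numbers (not defaults from this repository):

```python
# Each decode request consumes 1 target token plus max_draft_len draft tokens,
# so the token budget caps the largest pure-decode batch a CUDA Graph can cover.
max_batch_size = 256
max_num_tokens = 2048
max_draft_len = 3

max_cuda_graph_bs = min(max_batch_size, int(max_num_tokens / (1 + max_draft_len)))
print(max_cuda_graph_bs)  # min(256, 2048 // 4) = min(256, 512) -> 256
```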
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
# is that if the user is OK padding to a batch size B, they should also
# be OK with padding to some size B' < B since the performance will generally
# just be better in the smaller case.
if padding_enabled and (i == 0
or result[i - 1] != max_cuda_graph_bs):
if enable_padding and (i == 0
or result[i - 1] != max_cuda_graph_bs):
logger.warning(
"CUDA graph padding is enabled, but one of the given CUDA graph "
f"batch sizes ({bs}) is larger than the executor's max batch size "
2 changes: 1 addition & 1 deletion tensorrt_llm/bench/benchmark/utils/general.py
@@ -152,7 +152,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
pass

cuda_graph_config = {
"padding_enabled": True,
"enable_padding": True,
"max_batch_size": max_batch_size
}

16 changes: 8 additions & 8 deletions tensorrt_llm/llmapi/llm_args.py
@@ -72,7 +72,7 @@ class CudaGraphConfig(BaseModel):
max_batch_size: int = Field(
default=0, description="Maximum batch size for CUDA graphs.")

padding_enabled: bool = Field(
enable_padding: bool = Field(
default=False,
description=
"If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -1917,17 +1917,17 @@ def validate_stream_interval(self):

@staticmethod
def _generate_cuda_graph_batch_sizes(max_batch_size: int,
padding_enabled: bool) -> List[int]:
enable_padding: bool) -> List[int]:
"""Generate a list of batch sizes for CUDA graphs.

Args:
max_batch_size: Maximum batch size to generate up to
padding_enabled: Whether padding is enabled, which affects the batch size distribution
enable_padding: Whether padding is enabled, which affects the batch size distribution

Returns:
List of batch sizes to create CUDA graphs for
"""
if padding_enabled:
if enable_padding:
batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
else:
batch_sizes = list(range(1, 32)) + [32, 64, 128]
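A standalone sketch of just the list construction shown above; the later clamping against max_batch_size happens outside this hunk and is not reproduced:

```python
from typing import List

def generate_cuda_graph_batch_sizes(enable_padding: bool) -> List[int]:
    # Mirrors the branch above: a coarse grid when padding can round batches up,
    # a dense grid when every batch size must be captured exactly.
    if enable_padding:
        return [1, 2, 4] + [i * 8 for i in range(1, 17)]
    return list(range(1, 32)) + [32, 64, 128]

print(generate_cuda_graph_batch_sizes(True))        # [1, 2, 4, 8, 16, 24, ..., 128]
print(len(generate_cuda_graph_batch_sizes(False)))  # 34: sizes 1-31 plus 32, 64, 128
```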
@@ -1965,7 +1965,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
config.batch_sizes = sorted(config.batch_sizes)
if config.max_batch_size != 0:
if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
config.max_batch_size, config.padding_enabled):
config.max_batch_size, config.enable_padding):
raise ValueError(
"Please don't set both cuda_graph_config.batch_sizes "
"and cuda_graph_config.max_batch_size.\n"
@@ -1977,7 +1977,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
else:
max_batch_size = config.max_batch_size or 128
generated_sizes = self._generate_cuda_graph_batch_sizes(
max_batch_size, config.padding_enabled)
max_batch_size, config.enable_padding)
config.batch_sizes = generated_sizes
config.max_batch_size = max_batch_size

@@ -1996,9 +1996,9 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
if self.cuda_graph_config else
CudaGraphConfig.model_fields['max_batch_size'].default,
cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
if self.cuda_graph_config else
CudaGraphConfig.model_fields['padding_enabled'].default,
CudaGraphConfig.model_fields['enable_padding'].default,
disable_overlap_scheduler=self.disable_overlap_scheduler,
moe_max_num_tokens=self.moe_max_num_tokens,
moe_load_balancer=self.moe_load_balancer,
14 changes: 7 additions & 7 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -97,7 +97,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -123,7 +123,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -147,7 +147,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -185,7 +185,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -759,7 +759,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
disable_overlap_scheduler=False,
cuda_graph_config=CudaGraphConfig(
max_batch_size=512,
padding_enabled=True,
enable_padding=True,
),
)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -782,7 +782,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
pytorch_config = dict(
disable_overlap_scheduler=False,
cuda_graph_config=CudaGraphConfig(padding_enabled=True),
cuda_graph_config=CudaGraphConfig(enable_padding=True),
)
quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
@@ -1854,7 +1854,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness):
def test_auto_dtype(self):
"RCCA: https://nvbugspro.nvidia.com/bug/5310520"
pytorch_config = dict(cuda_graph_config=CudaGraphConfig(
padding_enabled=True, max_batch_size=384))
enable_padding=True, max_batch_size=384))
with LLM(self.MODEL_PATH, **pytorch_config,
enable_attention_dp=True) as llm:
task = MMLU(self.MODEL_NAME)
@@ -17,7 +17,7 @@ generation_servers:
pipeline_parallel_size: 1
enable_attention_dp: true
cuda_graph_config:
padding_enabled: False
enable_padding: False
disable_overlap_scheduler: False
urls:
- "localhost:8002"
@@ -15,7 +15,7 @@ generation_servers:
tensor_parallel_size: 2
pipeline_parallel_size: 1
cuda_graph_config:
padding_enabled: False
enable_padding: False
disable_overlap_scheduler: False
urls:
- "localhost:8002"
@@ -28,7 +28,7 @@ generation_servers:
free_gpu_memory_fraction: 0.2
enable_partial_reuse: False
cuda_graph_config:
padding_enabled: True
enable_padding: True
batch_sizes: [1,4,8,16,24,32]
disable_overlap_scheduler: True
urls:
6 changes: 3 additions & 3 deletions tests/integration/defs/perf/pytorch_model_config.py
@@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str,
base_config = {
'print_iter_log': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
},
}
if 'kv_cache_dtype' in model_label:
@@ -66,7 +66,7 @@ def get_model_yaml_config(model_label: str,
'config': {
'enable_attention_dp': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
}
}
@@ -89,7 +89,7 @@ def get_model_yaml_config(model_label: str,
'config': {
'print_iter_log': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': [1, 512, 1024, 2048]
}
}
2 changes: 1 addition & 1 deletion tests/integration/defs/stress_test/stress_test.py
@@ -519,7 +519,7 @@ def stress_test(config,
if config.backend == "pytorch":
extra_llm_options.update({
"cuda_graph_config": {
"padding_enabled": True,
"enable_padding": True,
"batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
},
"print_iter_log": True,
3 changes: 1 addition & 2 deletions tests/unittest/_torch/test_pytorch_model_engine.py
@@ -307,8 +307,7 @@ def test_cuda_graph_enable(self):
"CUDA graphs should be disabled when cuda_graph_config=None")

# Test 4: Custom CudaGraphConfig with specific settings
custom_config = CudaGraphConfig(max_batch_size=256,
padding_enabled=True)
custom_config = CudaGraphConfig(max_batch_size=256, enable_padding=True)
llm_args_custom = LlmArgs.from_kwargs(model="dummy_model",
cuda_graph_config=custom_config)
pytorch_config_custom = llm_args_custom.get_pytorch_backend_config()