@@ -196,7 +196,7 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 896
- 512
@@ -263,7 +263,7 @@ YOUR_DATA_PATH=./dataset.txt

cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -157,7 +157,7 @@ These optimizations target the overall execution flow, scheduling, and resource

There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of captured CUDA Graphs and the CUDA Graph hit ratio: it pads a batch up to the nearest batch size that has a captured CUDA Graph. Normally you should enable CUDA Graph padding to increase the hit rate, but the padding itself has some overhead due to wasted token computation.

Users can opt out of the CUDA Graph padding feature to see the perf benefit by setting `cuda_graph_config:\n padding_enabled: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
Users can opt out of the CUDA Graph padding feature to see the perf benefit by setting `cuda_graph_config:\n enable_padding: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
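As a rough sketch of what the padding feature does (and what opting out skips), the snippet below rounds a batch up to the nearest captured CUDA Graph size; the helper name and the captured sizes are invented for illustration and are not part of this change.

```python
import bisect

def pad_to_captured_size(batch_size: int, captured_sizes: list[int]) -> int:
    """Conceptual sketch: round a batch up to the nearest captured CUDA Graph size.

    Returns the original batch size when no captured graph is large enough,
    i.e. the batch falls back to eager execution.
    """
    sizes = sorted(captured_sizes)
    idx = bisect.bisect_left(sizes, batch_size)
    return sizes[idx] if idx < len(sizes) else batch_size

# With enable_padding: true and captured sizes [1, 2, 4, 8, 16], a batch of 5
# is padded (3 wasted sequences) so it can reuse the graph captured for 8.
print(pad_to_captured_size(5, [1, 2, 4, 8, 16]))  # -> 8
# With enable_padding: false, the same batch of 5 would miss the captured graphs.
```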

* Overlap Scheduler:

2 changes: 1 addition & 1 deletion docs/source/performance/perf-overview.md
@@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
`llm_options.yml`
```yaml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
2 changes: 1 addition & 1 deletion docs/source/scripts/disaggregated/gen_yaml.py
@@ -190,7 +190,7 @@ def gen_config_file(config_path: str,
'max_seq_len': 8576,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
2 changes: 1 addition & 1 deletion examples/llm-api/llm_runtime.py
@@ -21,7 +21,7 @@ def example_cuda_graph_config():

cuda_graph_config = CudaGraphConfig(
batch_sizes=[1, 2, 4],
padding_enabled=True,
enable_padding=True,
)

llm = LLM(
2 changes: 1 addition & 1 deletion examples/llm-api/quickstart_advanced.py
@@ -188,7 +188,7 @@ def setup_llm(args):

cuda_graph_config = CudaGraphConfig(
batch_sizes=args.cuda_graph_batch_sizes,
padding_enabled=args.cuda_graph_padding_enabled,
enable_padding=args.cuda_graph_padding_enabled,
) if args.use_cuda_graph else None
llm = LLM(
model=args.model_dir,
10 changes: 5 additions & 5 deletions examples/models/core/deepseek_v3/README.md
@@ -142,7 +142,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \

cat <<EOF > /tmp/extra-llm-api-config.yml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes: [1, 4, 8, 12]
EOF

@@ -169,7 +169,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \

cat <<EOF > /tmp/extra-llm-api-config.yml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes: [1, 2]
moe_max_num_tokens: 16384
EOF
@@ -237,7 +237,7 @@ To serve the model using `trtllm-serve`:
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -316,7 +316,7 @@ export TRTLLM_USE_UCX_KVCACHE=1

cat >./gen-extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -539,7 +539,7 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \

cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
8 changes: 4 additions & 4 deletions examples/models/core/llama4/README.md
@@ -29,15 +29,15 @@ enable_attention_dp: true
stream_interval: 2
cuda_graph_config:
max_batch_size: 512
padding_enabled: true
enable_padding: true
EOF
```
Explanation:
- `enable_attention_dp`: Enable attention Data Parallel, which is recommended under high concurrency.
- `stream_interval`: The iteration interval to create responses under the streaming mode.
- `cuda_graph_config`: CUDA Graph config.
- `max_batch_size`: Max CUDA graph batch size to capture.
- `padding_enabled`: Whether to enable CUDA graph padding.
- `enable_padding`: Whether to enable CUDA graph padding.


#### 2. Launch trtllm-serve OpenAI-compatible API server
@@ -81,7 +81,7 @@ enable_min_latency: true
stream_interval: 2
cuda_graph_config:
max_batch_size: 8
padding_enabled: true
enable_padding: true
EOF
```
Explanation:
@@ -90,7 +90,7 @@ Explanation:
- `stream_interval`: The iteration interval to create responses under the streaming mode.
- `cuda_graph_config`: CUDA Graph config.
- `max_batch_size`: Max CUDA graph batch size to capture.
- `padding_enabled`: Whether to enable CUDA graph padding.
- `enable_padding`: Whether to enable CUDA graph padding.


#### 2. Launch trtllm-serve OpenAI-compatible API server
4 changes: 2 additions & 2 deletions examples/models/core/qwen/README.md
@@ -745,7 +745,7 @@ To serve the model using `trtllm-serve`:
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
@@ -821,7 +821,7 @@ export TRTLLM_USE_UCX_KVCACHE=1

cat >./gen-extra-llm-api-config.yml <<EOF
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2
2 changes: 1 addition & 1 deletion examples/wide_ep/slurm_scripts/gen_yaml.py
@@ -196,7 +196,7 @@ def gen_config_file(config_path: str,
'max_seq_len': 2176,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
6 changes: 3 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -309,7 +309,7 @@ def get_rank_model_storage(model):
def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
max_batch_size: int, max_num_tokens: int,
max_draft_len: int,
padding_enabled: bool) -> list[int]:
enable_padding: bool) -> list[int]:
# This is the largest possible batch size for a pure decoding batch.
max_cuda_graph_bs = min(max_batch_size,
int(max_num_tokens / (1 + max_draft_len)))
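For concreteness, a small worked example of the formula above, with purely illustrative numbers (not defaults from this repository):

```python
# Each decode request consumes 1 target token plus max_draft_len draft tokens,
# so the token budget caps the largest pure-decode batch a CUDA Graph can cover.
max_batch_size = 256
max_num_tokens = 2048
max_draft_len = 3

max_cuda_graph_bs = min(max_batch_size, int(max_num_tokens / (1 + max_draft_len)))
print(max_cuda_graph_bs)  # min(256, 2048 // 4) = min(256, 512) -> 256
```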
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
# is that if the user is OK padding to a batch size B, they should also
# be OK with padding to some size B' < B since the performance will generally
# just be better in the smaller case.
if padding_enabled and (i == 0
or result[i - 1] != max_cuda_graph_bs):
if enable_padding and (i == 0
or result[i - 1] != max_cuda_graph_bs):
logger.warning(
"CUDA graph padding is enabled, but one of the given CUDA graph "
f"batch sizes ({bs}) is larger than the executor's max batch size "
2 changes: 1 addition & 1 deletion tensorrt_llm/bench/benchmark/utils/general.py
@@ -152,7 +152,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
pass

cuda_graph_config = {
"padding_enabled": True,
"enable_padding": True,
"max_batch_size": max_batch_size
}

16 changes: 8 additions & 8 deletions tensorrt_llm/llmapi/llm_args.py
@@ -72,7 +72,7 @@ class CudaGraphConfig(BaseModel):
max_batch_size: int = Field(
default=0, description="Maximum batch size for CUDA graphs.")

padding_enabled: bool = Field(
enable_padding: bool = Field(
default=False,
description=
"If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -1917,17 +1917,17 @@ def validate_stream_interval(self):

@staticmethod
def _generate_cuda_graph_batch_sizes(max_batch_size: int,
padding_enabled: bool) -> List[int]:
enable_padding: bool) -> List[int]:
"""Generate a list of batch sizes for CUDA graphs.

Args:
max_batch_size: Maximum batch size to generate up to
padding_enabled: Whether padding is enabled, which affects the batch size distribution
enable_padding: Whether padding is enabled, which affects the batch size distribution

Returns:
List of batch sizes to create CUDA graphs for
"""
if padding_enabled:
if enable_padding:
batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
else:
batch_sizes = list(range(1, 32)) + [32, 64, 128]
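A standalone sketch of just the list construction shown above; the later clamping against max_batch_size happens outside this hunk and is not reproduced:

```python
from typing import List

def generate_cuda_graph_batch_sizes(enable_padding: bool) -> List[int]:
    # Mirrors the branch above: a coarse grid when padding can round batches up,
    # a dense grid when every batch size must be captured exactly.
    if enable_padding:
        return [1, 2, 4] + [i * 8 for i in range(1, 17)]
    return list(range(1, 32)) + [32, 64, 128]

print(generate_cuda_graph_batch_sizes(True))        # [1, 2, 4, 8, 16, 24, ..., 128]
print(len(generate_cuda_graph_batch_sizes(False)))  # 34: sizes 1-31 plus 32, 64, 128
```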
@@ -1965,7 +1965,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
config.batch_sizes = sorted(config.batch_sizes)
if config.max_batch_size != 0:
if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
config.max_batch_size, config.padding_enabled):
config.max_batch_size, config.enable_padding):
raise ValueError(
"Please don't set both cuda_graph_config.batch_sizes "
"and cuda_graph_config.max_batch_size.\n"
@@ -1977,7 +1977,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
else:
max_batch_size = config.max_batch_size or 128
generated_sizes = self._generate_cuda_graph_batch_sizes(
max_batch_size, config.padding_enabled)
max_batch_size, config.enable_padding)
config.batch_sizes = generated_sizes
config.max_batch_size = max_batch_size

@@ -1996,9 +1996,9 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
if self.cuda_graph_config else
CudaGraphConfig.model_fields['max_batch_size'].default,
cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
if self.cuda_graph_config else
CudaGraphConfig.model_fields['padding_enabled'].default,
CudaGraphConfig.model_fields['enable_padding'].default,
disable_overlap_scheduler=self.disable_overlap_scheduler,
moe_max_num_tokens=self.moe_max_num_tokens,
moe_load_balancer=self.moe_load_balancer,
14 changes: 7 additions & 7 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -97,7 +97,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -123,7 +123,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -147,7 +147,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -185,7 +185,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
enable_fullgraph=True) if torch_compile else None
pytorch_config = dict(
torch_compile_config=torch_compile_config,
cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
batch_sizes=[4]),
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
@@ -759,7 +759,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
disable_overlap_scheduler=False,
cuda_graph_config=CudaGraphConfig(
max_batch_size=512,
padding_enabled=True,
enable_padding=True,
),
)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -782,7 +782,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
pytorch_config = dict(
disable_overlap_scheduler=False,
cuda_graph_config=CudaGraphConfig(padding_enabled=True),
cuda_graph_config=CudaGraphConfig(enable_padding=True),
)
quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
@@ -1854,7 +1854,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness):
def test_auto_dtype(self):
"RCCA: https://nvbugspro.nvidia.com/bug/5310520"
pytorch_config = dict(cuda_graph_config=CudaGraphConfig(
padding_enabled=True, max_batch_size=384))
enable_padding=True, max_batch_size=384))
with LLM(self.MODEL_PATH, **pytorch_config,
enable_attention_dp=True) as llm:
task = MMLU(self.MODEL_NAME)
@@ -17,7 +17,7 @@ generation_servers:
pipeline_parallel_size: 1
enable_attention_dp: true
cuda_graph_config:
padding_enabled: False
enable_padding: False
disable_overlap_scheduler: False
urls:
- "localhost:8002"
@@ -15,7 +15,7 @@ generation_servers:
tensor_parallel_size: 2
pipeline_parallel_size: 1
cuda_graph_config:
padding_enabled: False
enable_padding: False
disable_overlap_scheduler: False
urls:
- "localhost:8002"
@@ -28,7 +28,7 @@ generation_servers:
free_gpu_memory_fraction: 0.2
enable_partial_reuse: False
cuda_graph_config:
padding_enabled: True
enable_padding: True
batch_sizes: [1,4,8,16,24,32]
disable_overlap_scheduler: True
urls:
6 changes: 3 additions & 3 deletions tests/integration/defs/perf/pytorch_model_config.py
@@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str,
base_config = {
'print_iter_log': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
},
}
if 'kv_cache_dtype' in model_label:
@@ -66,7 +66,7 @@ def get_model_yaml_config(model_label: str,
'config': {
'enable_attention_dp': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
}
}
@@ -89,7 +89,7 @@ def get_model_yaml_config(model_label: str,
'config': {
'print_iter_log': True,
'cuda_graph_config': {
'padding_enabled': True,
'enable_padding': True,
'batch_sizes': [1, 512, 1024, 2048]
}
}
2 changes: 1 addition & 1 deletion tests/integration/defs/stress_test/stress_test.py
@@ -519,7 +519,7 @@ def stress_test(config,
if config.backend == "pytorch":
extra_llm_options.update({
"cuda_graph_config": {
"padding_enabled": True,
"enable_padding": True,
"batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
},
"print_iter_log": True,
3 changes: 1 addition & 2 deletions tests/unittest/_torch/test_pytorch_model_engine.py
@@ -307,8 +307,7 @@ def test_cuda_graph_enable(self):
"CUDA graphs should be disabled when cuda_graph_config=None")

# Test 4: Custom CudaGraphConfig with specific settings
custom_config = CudaGraphConfig(max_batch_size=256,
padding_enabled=True)
custom_config = CudaGraphConfig(max_batch_size=256, enable_padding=True)
llm_args_custom = LlmArgs.from_kwargs(model="dummy_model",
cuda_graph_config=custom_config)
pytorch_config_custom = llm_args_custom.get_pytorch_backend_config()