@@ -4450,15 +4450,27 @@ def __post_init__(self):
             self.compilation_config.custom_ops.append("+rms_norm")
         if envs.VLLM_USE_V1 and self.model_config is not None and \
             not self.model_config.enforce_eager:
-            # FIXME(rob): Add function to set all of these.
-            if not self.compilation_config.custom_ops:
-                self.compilation_config.custom_ops = ["none"]
+            # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
+            # is set to True, full CUDA graphs will be used.
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
             self.compilation_config.level = CompilationLevel.PIECEWISE
             self.compilation_config.set_splitting_ops_for_v1()
 
+        # The behavior of custom ops with inductor depends on the config:
+        # - If use_inductor=True and custom_ops is empty:
+        #   Inductor generates Triton kernels for all registered custom ops
+        #   (default behavior)
+        # - If use_inductor=True and custom_ops is non-empty:
+        #   Custom CUDA kernels are used for the specified ops, while inductor
+        #   generates Triton kernels for the remaining ops, including misc
+        #   torch ops in the model.
+        if (not self.compilation_config.custom_ops
+                and self.compilation_config.use_inductor):
+            # Let inductor generate Triton kernels for the custom ops.
+            self.compilation_config.custom_ops = ["none"]
+
         self._set_cudagraph_sizes()
 
         if self.cache_config.cpu_offload_gb > 0 and \
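For context on why `["none"]` is the right "empty" default here: the `custom_ops` list is made of string tokens, where `"all"`/`"none"` set a blanket default and `"+op"`/`"-op"` force-enable or force-disable an individual op (matching the `"+rms_norm"` append at the top of the hunk). The snippet below is a minimal sketch of those token semantics, not vLLM's actual resolution code; the helper name `custom_op_enabled` and its signature are hypothetical, for illustration only.

    # Sketch (hypothetical helper, assumed token semantics): how a
    # custom_ops list could be resolved into a per-op enable/disable
    # decision. "+op"/"-op" override the blanket "all"/"none" default.
    from typing import Sequence

    def custom_op_enabled(op_name: str, custom_ops: Sequence[str],
                          enabled_by_default: bool = True) -> bool:
        """Return True if the custom (CUDA) kernel for op_name should be
        used instead of an inductor-generated Triton kernel."""
        count_on = custom_ops.count("+" + op_name)   # explicit enable
        count_off = custom_ops.count("-" + op_name)  # explicit disable
        assert count_on + count_off <= 1, f"conflicting settings: {op_name}"
        if count_on:
            return True
        if count_off:
            return False
        # No per-op override: fall back to the blanket default tokens.
        if "all" in custom_ops:
            return True
        if "none" in custom_ops:
            return False
        return enabled_by_default

    # With the config set by __post_init__ above (custom_ops == ["none"]),
    # every op takes the Triton path unless explicitly re-enabled:
    assert custom_op_enabled("rms_norm", ["none"]) is False
    assert custom_op_enabled("rms_norm", ["none", "+rms_norm"]) is True

Under this reading, the new code only substitutes `["none"]` when the user left `custom_ops` empty and inductor is in use, so an explicit user choice (for example `["+rms_norm"]`) is never overridden.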