Commit aafbbd9

[torch.compile] Use custom ops when use_inductor=False (#19618)
1 parent: 0f08745

File tree

1 file changed (+15 -3 lines)

vllm/config.py

Lines changed: 15 additions & 3 deletions
@@ -4450,15 +4450,27 @@ def __post_init__(self):
             self.compilation_config.custom_ops.append("+rms_norm")
         if envs.VLLM_USE_V1 and self.model_config is not None and \
             not self.model_config.enforce_eager:
-            # FIXME(rob): Add function to set all of these.
-            if not self.compilation_config.custom_ops:
-                self.compilation_config.custom_ops = ["none"]
+            # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
+            # is set to True, full CUDA graphs will be used.
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
             self.compilation_config.level = CompilationLevel.PIECEWISE
             self.compilation_config.set_splitting_ops_for_v1()
 
+            # The behavior of custom ops with inductor depends on the config:
+            # - If use_inductor=True and custom_ops is empty:
+            #   Inductor generates Triton kernels for all registered custom ops
+            #   (default behavior)
+            # - If use_inductor=True and custom_ops is non-empty:
+            #   Custom CUDA kernels are used for specified ops while inductor
+            #   generates Triton kernels for remaining ops, including misc torch
+            #   ops in the model.
+            if (not self.compilation_config.custom_ops
+                    and self.compilation_config.use_inductor):
+                # Let inductor generate Triton kernels for the custom ops.
+                self.compilation_config.custom_ops = ["none"]
+
         self._set_cudagraph_sizes()
 
         if self.cache_config.cpu_offload_gb > 0 and \
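In effect, custom_ops is now only defaulted to ["none"] when inductor will generate Triton kernels for the custom ops; with use_inductor=False, the registered custom CUDA kernels stay in use, as the commit title says. Below is a minimal, self-contained sketch of that decision, using plain dataclasses with illustrative names (CompilationConfigSketch and resolve_custom_ops are stand-ins, not vLLM's actual CompilationConfig API), assuming only the two fields the diff touches:

from dataclasses import dataclass, field


@dataclass
class CompilationConfigSketch:
    # Illustrative stand-ins for the two CompilationConfig fields involved.
    use_inductor: bool = True
    custom_ops: list[str] = field(default_factory=list)


def resolve_custom_ops(cfg: CompilationConfigSketch) -> list[str]:
    # Mirrors the hunk above: disable all custom ops ("none") only when
    # inductor is enabled and no custom ops were explicitly requested,
    # so inductor generates Triton kernels for them instead.
    if not cfg.custom_ops and cfg.use_inductor:
        cfg.custom_ops = ["none"]
    return cfg.custom_ops


# Before this commit, the first two cases were both forced to ["none"].
assert resolve_custom_ops(CompilationConfigSketch(use_inductor=True)) == ["none"]
assert resolve_custom_ops(CompilationConfigSketch(use_inductor=False)) == []
assert resolve_custom_ops(
    CompilationConfigSketch(custom_ops=["+rms_norm"])) == ["+rms_norm"]

An explicitly requested op such as "+rms_norm" (appended earlier in the same __post_init__, as the first context line of the hunk shows) is preserved in either mode; leaving custom_ops empty in the use_inductor=False case is what keeps the custom CUDA ops enabled.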
