From 4f6e1b4360124fc9d0148fe37c1a4e39d2820d7d Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 15:02:32 -0700 Subject: [PATCH 01/29] init Signed-off-by: Boyuan Feng --- vllm/attention/layer.py | 2 + vllm/compilation/backends.py | 80 +++++++++++++++++++------- vllm/compilation/compiler_interface.py | 1 + vllm/config/compilation.py | 2 +- vllm/v1/cudagraph_dispatcher.py | 16 +++--- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 44cb2c7c6b64..cc562b239479 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -558,6 +558,7 @@ def unified_attention_fake( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) @@ -608,4 +609,5 @@ def unified_attention_with_output_fake( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3cc0fc3106f5..6572e176b486 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -15,6 +15,7 @@ from torch._dispatch.python import enable_python_dispatcher import vllm.envs as envs +from vllm.attention.layer import Attention from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform @@ -326,6 +327,45 @@ def call_module(self, target: torch.fx.node.Target, i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time + # torch._inductor.config.triton.customized_cudagraph_wrappers = [lambda f: CUDAGraphWrapper(f, VllmConfig(), CUDAGraphMode.PIECEWISE), lambda f: f] + + # self.module.__dict__[target] = static_graph_wrapper_class( + # runnable=piecewise_backend, + # vllm_config=self.vllm_config, + # runtime_mode=CUDAGraphMode.PIECEWISE, + # cudagraph_options=CUDAGraphOptions( + # debug_log_enable=piecewise_backend.is_first_graph, + # gc_disable=not piecewise_backend.is_first_graph, + # weak_ref_output=piecewise_backend.is_last_graph)) + + from .cuda_graph import CUDAGraphOptions + cudagraph_options_first = CUDAGraphOptions(debug_log_enable=True, + gc_disable=not True, + weak_ref_output=False) + + cudagraph_options_mid = CUDAGraphOptions(debug_log_enable=False, + gc_disable=not False, + weak_ref_output=False) + + cudagraph_options_last = CUDAGraphOptions(debug_log_enable=False, + gc_disable=not False, + weak_ref_output=True) + + num_layers = len( + list(x for x in self.vllm_config.compilation_config. + static_forward_context if isinstance(x, Attention))) + 1 + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + make_fn = lambda i: lambda f: static_graph_wrapper_class( + runnable=f, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions(True, i != 0, i == + num_layers - 1)) + fns = [make_fn(i) for i in range(num_layers)] + # self.vllm_config.compilation_config.static_forward_context.attention_layer + self.compilation_config.inductor_compile_config[ + "triton.customized_cudagraph_wrappers"] = fns compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -344,26 +384,26 @@ def call_module(self, target: torch.fx.node.Target, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: - # resolve the static graph wrapper class (e.g. CUDAGraphWrapper - # class) as platform dependent. - static_graph_wrapper_class = resolve_obj_by_qualname( - current_platform.get_static_graph_wrapper_cls()) - - # Always assign PIECEWISE runtime mode to the - # CUDAGraphWrapper for piecewise_backend, to distinguish - # it from the FULL cudagraph runtime mode, no matter it - # is wrapped on a full or piecewise fx graph. - self.module.__dict__[target] = static_graph_wrapper_class( - runnable=piecewise_backend, - vllm_config=self.vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions( - debug_log_enable=piecewise_backend.is_first_graph, - gc_disable=not piecewise_backend.is_first_graph, - weak_ref_output=piecewise_backend.is_last_graph)) - else: - self.module.__dict__[target] = piecewise_backend + # if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + # # resolve the static graph wrapper class (e.g. CUDAGraphWrapper + # # class) as platform dependent. + # static_graph_wrapper_class = resolve_obj_by_qualname( + # current_platform.get_static_graph_wrapper_cls()) + + # # Always assign PIECEWISE runtime mode to the + # # CUDAGraphWrapper for piecewise_backend, to distinguish + # # it from the FULL cudagraph runtime mode, no matter it + # # is wrapped on a full or piecewise fx graph. + # self.module.__dict__[target] = static_graph_wrapper_class( + # runnable=piecewise_backend, + # vllm_config=self.vllm_config, + # runtime_mode=CUDAGraphMode.PIECEWISE, + # cudagraph_options=CUDAGraphOptions( + # debug_log_enable=piecewise_backend.is_first_graph, + # gc_disable=not piecewise_backend.is_first_graph, + # weak_ref_output=piecewise_backend.is_last_graph)) + # else: + self.module.__dict__[target] = piecewise_backend compilation_counter.num_piecewise_capturable_graphs_seen += 1 diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 7158fd685964..36abea709561 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -344,6 +344,7 @@ def hijacked_compile_fx_inner(*args, **kwargs): inductor_compiled_graph = output if inductor_compiled_graph is not None: nonlocal file_path + breakpoint() compiled_fn = inductor_compiled_graph.current_callable file_path = compiled_fn.__code__.co_filename # noqa if not file_path.startswith(self.base_cache_dir): diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f8ccc2022261..2dd45d573bbc 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -560,7 +560,7 @@ def set_splitting_ops_for_v1(self): "using attention backends that support cudagraph or set " "cudagraph_mode to NONE explicitly if encountering " "any problems.") - self.cudagraph_mode = CUDAGraphMode.FULL + # self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index d2db7dcb3f09..1da6de67d986 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -39,14 +39,14 @@ def __init__(self, vllm_config: VllmConfig): CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ - "Compilation level should be CompilationLevel.PIECEWISE when "\ - "cudagraph_mode piecewise cudagraphs is used, "\ - f"cudagraph_mode={self.cudagraph_mode}, "\ - f"compilation_level={self.compilation_config.level}, "\ - f"splitting_ops={self.compilation_config.splitting_ops}" + # assert not self.cudagraph_mode.requires_piecewise_compilation() or \ + # (self.compilation_config.level == CompilationLevel.PIECEWISE and + # self.compilation_config.splitting_ops_contain_attention()), \ + # "Compilation level should be CompilationLevel.PIECEWISE when "\ + # "cudagraph_mode piecewise cudagraphs is used, "\ + # f"cudagraph_mode={self.cudagraph_mode}, "\ + # f"compilation_level={self.compilation_config.level}, "\ + # f"splitting_ops={self.compilation_config.splitting_ops}" self.keys_initialized = False From 1c1b600980b031c48e1f9d5e80bc07e7129443db Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 16:21:37 -0700 Subject: [PATCH 02/29] nit Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 6572e176b486..40a6c440e7be 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -327,29 +327,8 @@ def call_module(self, target: torch.fx.node.Target, i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time - # torch._inductor.config.triton.customized_cudagraph_wrappers = [lambda f: CUDAGraphWrapper(f, VllmConfig(), CUDAGraphMode.PIECEWISE), lambda f: f] - - # self.module.__dict__[target] = static_graph_wrapper_class( - # runnable=piecewise_backend, - # vllm_config=self.vllm_config, - # runtime_mode=CUDAGraphMode.PIECEWISE, - # cudagraph_options=CUDAGraphOptions( - # debug_log_enable=piecewise_backend.is_first_graph, - # gc_disable=not piecewise_backend.is_first_graph, - # weak_ref_output=piecewise_backend.is_last_graph)) from .cuda_graph import CUDAGraphOptions - cudagraph_options_first = CUDAGraphOptions(debug_log_enable=True, - gc_disable=not True, - weak_ref_output=False) - - cudagraph_options_mid = CUDAGraphOptions(debug_log_enable=False, - gc_disable=not False, - weak_ref_output=False) - - cudagraph_options_last = CUDAGraphOptions(debug_log_enable=False, - gc_disable=not False, - weak_ref_output=True) num_layers = len( list(x for x in self.vllm_config.compilation_config. @@ -362,10 +341,12 @@ def call_module(self, target: torch.fx.node.Target, runtime_mode=CUDAGraphMode.PIECEWISE, cudagraph_options=CUDAGraphOptions(True, i != 0, i == num_layers - 1)) - fns = [make_fn(i) for i in range(num_layers)] - # self.vllm_config.compilation_config.static_forward_context.attention_layer + self.compilation_config.inductor_compile_config[ - "triton.customized_cudagraph_wrappers"] = fns + "customized_partition_wrappers"] = [ + make_fn(i) for i in range(num_layers) + ] + compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -376,7 +357,6 @@ def call_module(self, target: torch.fx.node.Target, num_graphs=len(self.compile_submod_names), runtime_shape=None) # Lazy import here to avoid circular import - from .cuda_graph import CUDAGraphOptions from .cuda_piecewise_backend import PiecewiseBackend piecewise_backend = PiecewiseBackend( From 50d1ddacca52ed206c8480111f0b364d46a924b9 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 16:22:22 -0700 Subject: [PATCH 03/29] nit Signed-off-by: Boyuan Feng --- vllm/compilation/compiler_interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 36abea709561..7158fd685964 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -344,7 +344,6 @@ def hijacked_compile_fx_inner(*args, **kwargs): inductor_compiled_graph = output if inductor_compiled_graph is not None: nonlocal file_path - breakpoint() compiled_fn = inductor_compiled_graph.current_callable file_path = compiled_fn.__code__.co_filename # noqa if not file_path.startswith(self.base_cache_dir): From 7218e2b32fd3d8b87150b2639cdc5f2de4727f23 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 17:00:17 -0700 Subject: [PATCH 04/29] nit Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 40a6c440e7be..521b79b1ee8d 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -331,8 +331,8 @@ def call_module(self, target: torch.fx.node.Target, from .cuda_graph import CUDAGraphOptions num_layers = len( - list(x for x in self.vllm_config.compilation_config. - static_forward_context if isinstance(x, Attention))) + 1 + list(v for (k, v) in self.vllm_config.compilation_config. + static_forward_context.items() if isinstance(v, Attention))) + 1 static_graph_wrapper_class = resolve_obj_by_qualname( current_platform.get_static_graph_wrapper_cls()) make_fn = lambda i: lambda f: static_graph_wrapper_class( From 71209e24f86b8d6528fce1eef92fe3a5338efb24 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 22:07:42 -0700 Subject: [PATCH 05/29] cleanup Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 84 ++++++++++++++++++--------------- vllm/config/compilation.py | 7 ++- vllm/v1/cudagraph_dispatcher.py | 17 +++---- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 521b79b1ee8d..28387699c607 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -328,24 +328,28 @@ def call_module(self, target: torch.fx.node.Target, ] global compilation_start_time - from .cuda_graph import CUDAGraphOptions - - num_layers = len( - list(v for (k, v) in self.vllm_config.compilation_config. - static_forward_context.items() if isinstance(v, Attention))) + 1 - static_graph_wrapper_class = resolve_obj_by_qualname( - current_platform.get_static_graph_wrapper_cls()) - make_fn = lambda i: lambda f: static_graph_wrapper_class( - runnable=f, - vllm_config=self.vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions(True, i != 0, i == - num_layers - 1)) - - self.compilation_config.inductor_compile_config[ - "customized_partition_wrappers"] = [ - make_fn(i) for i in range(num_layers) - ] + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and self.compilation_config.use_inductor_graph_partition): + from .cuda_graph import CUDAGraphOptions + + num_layers = len( + list(v for (k, v) in self.vllm_config.compilation_config. + static_forward_context.items() + if isinstance(v, Attention))) + 1 + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + make_fn = lambda i: lambda f: static_graph_wrapper_class( + runnable=f, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions(i == 0, i != 0, i == + num_layers - 1)) + + self.compilation_config.inductor_compile_config[ + "customized_partition_wrappers"] = [ + make_fn(i) for i in range(num_layers) + ] compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( @@ -364,26 +368,30 @@ def call_module(self, target: torch.fx.node.Target, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - # if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: - # # resolve the static graph wrapper class (e.g. CUDAGraphWrapper - # # class) as platform dependent. - # static_graph_wrapper_class = resolve_obj_by_qualname( - # current_platform.get_static_graph_wrapper_cls()) - - # # Always assign PIECEWISE runtime mode to the - # # CUDAGraphWrapper for piecewise_backend, to distinguish - # # it from the FULL cudagraph runtime mode, no matter it - # # is wrapped on a full or piecewise fx graph. - # self.module.__dict__[target] = static_graph_wrapper_class( - # runnable=piecewise_backend, - # vllm_config=self.vllm_config, - # runtime_mode=CUDAGraphMode.PIECEWISE, - # cudagraph_options=CUDAGraphOptions( - # debug_log_enable=piecewise_backend.is_first_graph, - # gc_disable=not piecewise_backend.is_first_graph, - # weak_ref_output=piecewise_backend.is_last_graph)) - # else: - self.module.__dict__[target] = piecewise_backend + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and + not self.compilation_config.use_inductor_graph_partition): + from .cuda_graph import CUDAGraphOptions + + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper + # class) as platform dependent. + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + # Always assign PIECEWISE runtime mode to the + # CUDAGraphWrapper for piecewise_backend, to distinguish + # it from the FULL cudagraph runtime mode, no matter it + # is wrapped on a full or piecewise fx graph. + self.module.__dict__[target] = static_graph_wrapper_class( + runnable=piecewise_backend, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=piecewise_backend.is_first_graph, + gc_disable=not piecewise_backend.is_first_graph, + weak_ref_output=piecewise_backend.is_last_graph)) + else: + self.module.__dict__[target] = piecewise_backend compilation_counter.num_piecewise_capturable_graphs_seen += 1 diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 2dd45d573bbc..efdb7227c851 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -299,6 +299,8 @@ class CompilationConfig: minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. """ + use_inductor_graph_partition: bool = False + pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -552,7 +554,8 @@ def set_splitting_ops_for_v1(self): elif len(self.splitting_ops) == 0: logger.warning_once("Using piecewise compilation with empty " "splitting_ops.") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE + and not self.use_inductor_graph_partition): logger.warning_once( "When compilation level is piecewise with empty " "splitting_ops, PIECEWISE cudagraph_mode will be " @@ -560,7 +563,7 @@ def set_splitting_ops_for_v1(self): "using attention backends that support cudagraph or set " "cudagraph_mode to NONE explicitly if encountering " "any problems.") - # self.cudagraph_mode = CUDAGraphMode.FULL + self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 1da6de67d986..ba9f4845ed9b 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -39,14 +39,15 @@ def __init__(self, vllm_config: VllmConfig): CUDAGraphMode.FULL: set(), } - # assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - # (self.compilation_config.level == CompilationLevel.PIECEWISE and - # self.compilation_config.splitting_ops_contain_attention()), \ - # "Compilation level should be CompilationLevel.PIECEWISE when "\ - # "cudagraph_mode piecewise cudagraphs is used, "\ - # f"cudagraph_mode={self.cudagraph_mode}, "\ - # f"compilation_level={self.compilation_config.level}, "\ - # f"splitting_ops={self.compilation_config.splitting_ops}" + if not vllm_config.compilation_config.use_inductor_graph_partition: + assert not self.cudagraph_mode.requires_piecewise_compilation() or \ + (self.compilation_config.level == CompilationLevel.PIECEWISE and + self.compilation_config.splitting_ops_contain_attention()), \ + "Compilation level should be CompilationLevel.PIECEWISE when "\ + "cudagraph_mode piecewise cudagraphs is used, "\ + f"cudagraph_mode={self.cudagraph_mode}, "\ + f"compilation_level={self.compilation_config.level}, "\ + f"splitting_ops={self.compilation_config.splitting_ops}" self.keys_initialized = False From 202b6f3c354249a7abfb4acce4cf661cbb19c66e Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 22:33:15 -0700 Subject: [PATCH 06/29] add doc Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 3 ++- vllm/config/compilation.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 28387699c607..ea9afd318aa2 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,7 +329,8 @@ def call_module(self, target: torch.fx.node.Target, global compilation_start_time if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and self.compilation_config.use_inductor_graph_partition): + and self.compilation_config.use_inductor_graph_partition + and is_torch_equal_or_newer("2.9.0.dev")): from .cuda_graph import CUDAGraphOptions num_layers = len( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index efdb7227c851..72bb55b10308 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -300,6 +300,24 @@ class CompilationConfig: """ use_inductor_graph_partition: bool = False + """Use inductor graph partition to split the graph at cudagraph_unsafe ops. + This partition happens at inductor codegen time after all passes and fusions + are finished. It generates a single `call` function which wraps + cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops + outside the partition functions. For a graph with N cudagraph-unsafe ops + (e.g., Attention), there would be N partition functions. To mark an op as + cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when + register the custom op. + + This config supports both full cudagraph and piecewise cudagraph without + compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper + to each partition function. For N partition functions, there would be N + CUDAGraph wrapper. + + For full CUDAGraph, we still apply a single CUDAGraph wrapper outside the + inductor `call` function. This captures away all the python-level partition + functions. + """ pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -463,6 +481,12 @@ def __post_init__(self) -> None: "since full_cuda_graph is deprecated.") self.cudagraph_mode = CUDAGraphMode.FULL + if (self.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + raise ValueError("use_inductor_graph_partition is only " + "supported with torch>=2.9.0.dev. Set " + "use_inductor_graph_partition=False instead.") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") From 0b1e18ab87032e5fbd0d53114f19b01739fc5124 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 4 Sep 2025 23:10:05 -0700 Subject: [PATCH 07/29] improve warn/error msg Signed-off-by: Boyuan Feng --- vllm/config/compilation.py | 6 ++++-- vllm/v1/cudagraph_dispatcher.py | 19 ++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 72bb55b10308..1e88d0879335 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -576,8 +576,10 @@ def set_splitting_ops_for_v1(self): # make a copy to avoid mutating the class-level list via reference. self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty " - "splitting_ops.") + logger.warning_once( + "Using piecewise compilation with empty " + "splitting_ops and use_inductor_graph_partition" + f"={self.use_inductor_graph_partition}.") if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE and not self.use_inductor_graph_partition): logger.warning_once( diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index ba9f4845ed9b..eaa3ed47d39e 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -39,15 +39,16 @@ def __init__(self, vllm_config: VllmConfig): CUDAGraphMode.FULL: set(), } - if not vllm_config.compilation_config.use_inductor_graph_partition: - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ - "Compilation level should be CompilationLevel.PIECEWISE when "\ - "cudagraph_mode piecewise cudagraphs is used, "\ - f"cudagraph_mode={self.cudagraph_mode}, "\ - f"compilation_level={self.compilation_config.level}, "\ - f"splitting_ops={self.compilation_config.splitting_ops}" + assert not self.cudagraph_mode.requires_piecewise_compilation() or \ + (self.compilation_config.level == CompilationLevel.PIECEWISE and + self.compilation_config.splitting_ops_contain_attention()) or\ + (self.compilation_config.use_inductor_graph_partition and \ + not self.compilation_config.splitting_ops_contain_attention()), \ + "Compilation level should be CompilationLevel.PIECEWISE when "\ + "cudagraph_mode piecewise cudagraphs is used, "\ + f"cudagraph_mode={self.cudagraph_mode}, "\ + f"compilation_level={self.compilation_config.level}, "\ + f"splitting_ops={self.compilation_config.splitting_ops}" self.keys_initialized = False From b66568b603cc0ba4264534bb455ca3679626a07e Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 5 Sep 2025 15:52:26 -0700 Subject: [PATCH 08/29] match new torch api Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index ea9afd318aa2..ec8419ac25d4 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -15,7 +15,6 @@ from torch._dispatch.python import enable_python_dispatcher import vllm.envs as envs -from vllm.attention.layer import Attention from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform @@ -331,26 +330,27 @@ def call_module(self, target: torch.fx.node.Target, if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.use_inductor_graph_partition and is_torch_equal_or_newer("2.9.0.dev")): + from torch._inductor.utils import CUDAGraphWrapperMetadata + from .cuda_graph import CUDAGraphOptions - num_layers = len( - list(v for (k, v) in self.vllm_config.compilation_config. - static_forward_context.items() - if isinstance(v, Attention))) + 1 static_graph_wrapper_class = resolve_obj_by_qualname( current_platform.get_static_graph_wrapper_cls()) - make_fn = lambda i: lambda f: static_graph_wrapper_class( - runnable=f, - vllm_config=self.vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions(i == 0, i != 0, i == - num_layers - 1)) - - self.compilation_config.inductor_compile_config[ - "customized_partition_wrappers"] = [ - make_fn(i) for i in range(num_layers) - ] + def customized_cudagraph_wrapper( + f, metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + partition_id == 0, partition_id != 0, + partition_id == num_partitions - 1)) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( From 87c74dddb093237af21874728fccaed66cb46fdb Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Mon, 8 Sep 2025 22:06:45 -0700 Subject: [PATCH 09/29] skip cudagraph for get_input_embedding Signed-off-by: Boyuan Feng --- vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d4afaf51e6e8..38fb17e8cce9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1780,13 +1780,21 @@ def _preprocess( self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) - # NOTE(woosuk): To unify token ids and soft tokens (vision - # embeddings), we always use embeddings (rather than token ids) - # as input to the multimodal model, even when the input is text. - inputs_embeds_scheduled = self.model.get_input_embeddings( - input_ids=self.input_ids.gpu[:num_scheduled_tokens], - multimodal_embeddings=mm_embeds or None, - ) + # Inductor graph partition attempts to wrap all inductor-generated + # functions with CUDAGraph wrapper. Set CUDAGraphMode.None to + # avoid that for computing input embeddings. + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + ): + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None, + ) # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( From c0bd3fb9582f1ae73af4e057310e69042e1ff34d Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 11 Sep 2025 13:09:22 -0700 Subject: [PATCH 10/29] Update vllm/compilation/backends.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Luka Govedič Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index ec8419ac25d4..b09eb62f27c3 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -346,8 +346,10 @@ def customized_cudagraph_wrapper( vllm_config=self.vllm_config, runtime_mode=CUDAGraphMode.PIECEWISE, cudagraph_options=CUDAGraphOptions( - partition_id == 0, partition_id != 0, - partition_id == num_partitions - 1)) + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) torch._inductor.utils.set_customized_partition_wrappers( customized_cudagraph_wrapper) From e16e23ac4a5b2905449df014280fda55f46e95a9 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 11 Sep 2025 13:19:53 -0700 Subject: [PATCH 11/29] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Luka Govedič Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 2 ++ vllm/config/compilation.py | 11 +++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index b09eb62f27c3..cde3e0fb8323 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -374,6 +374,8 @@ def customized_cudagraph_wrapper( if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and not self.compilation_config.use_inductor_graph_partition): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. from .cuda_graph import CUDAGraphOptions # resolve the static graph wrapper class (e.g. CUDAGraphWrapper diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 1e88d0879335..cde9c4014d91 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -305,18 +305,17 @@ class CompilationConfig: are finished. It generates a single `call` function which wraps cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops outside the partition functions. For a graph with N cudagraph-unsafe ops - (e.g., Attention), there would be N partition functions. To mark an op as + (e.g., Attention), there would be N+1 partitions. To mark an op as cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when register the custom op. This config supports both full cudagraph and piecewise cudagraph without compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper - to each partition function. For N partition functions, there would be N - CUDAGraph wrapper. + to each partition. For N+1 partitions, there would be N+1 + CUDAGraph wrapper instances. - For full CUDAGraph, we still apply a single CUDAGraph wrapper outside the - inductor `call` function. This captures away all the python-level partition - functions. + For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the + inductor `call` function in the model runner. The top-level full cudagraph capture ignores all partitioning. """ pass_config: PassConfig = field(default_factory=PassConfig) From 892ab467a4cff727904706fb8d26575320f5a89b Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 11 Sep 2025 14:21:29 -0700 Subject: [PATCH 12/29] more docs Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 10 ++++++++-- vllm/config/compilation.py | 18 +++++++++++++++++- vllm/v1/cudagraph_dispatcher.py | 19 ++++++++++++++----- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index cde3e0fb8323..6ce8f9c42c17 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -328,8 +328,14 @@ def call_module(self, target: torch.fx.node.Target, global compilation_start_time if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and self.compilation_config.use_inductor_graph_partition - and is_torch_equal_or_newer("2.9.0.dev")): + and self.compilation_config.use_inductor_graph_partition): + # If we're using Inductor-based graph partitioning, we currently + # have the whole `fx.Graph` before Inductor lowering and + # and the piecewise splitting happens after all graph + # passes and fusions. Here, we add a custom hook for Inductor + # to wrap each partition with our static graph wrapper class to + # maintain more control over static graph capture and replay. + from torch._inductor.utils import CUDAGraphWrapperMetadata from .cuda_graph import CUDAGraphOptions diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index cde9c4014d91..3da92fee1fc9 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -315,7 +315,8 @@ class CompilationConfig: CUDAGraph wrapper instances. For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the - inductor `call` function in the model runner. The top-level full cudagraph capture ignores all partitioning. + inductor `call` function in the model runner. The top-level full cudagraph + capture ignores all partitioning. """ pass_config: PassConfig = field(default_factory=PassConfig) @@ -442,6 +443,14 @@ def __post_init__(self) -> None: if KEY not in self.inductor_compile_config: self.inductor_compile_config[KEY] = False + if self.use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + logger.warning_once( + "Inductor graph partition requires pytorch 2.9 which is " + "not available. Falling back to " + "use_inductor_graph_partition=False.") + self.use_inductor_graph_partition = False + for k, v in self.inductor_passes.items(): if not isinstance(v, str): assert callable(v), ( @@ -590,6 +599,13 @@ def set_splitting_ops_for_v1(self): "any problems.") self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + elif self.use_inductor_graph_partition: + logger.warning_once( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + self.splitting_ops = [] if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": # exclude MoE dispatch/combine from capture by ensuring diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index eaa3ed47d39e..a8907bb42cb5 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -39,11 +39,20 @@ def __init__(self, vllm_config: VllmConfig): CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()) or\ - (self.compilation_config.use_inductor_graph_partition and \ - not self.compilation_config.splitting_ops_contain_attention()), \ + not_use_piecewise_compilation = ( + not self.cudagraph_mode.requires_piecewise_compilation()) + + use_fx_graph_piecewise_compilation = ( + self.compilation_config.level == CompilationLevel.PIECEWISE + and self.compilation_config.splitting_ops_contain_attention()) + + use_inductor_piecewise_compilation = ( + self.compilation_config.use_inductor_graph_partition + and not self.compilation_config.splitting_ops_contain_attention()) + + assert not_use_piecewise_compilation or \ + use_fx_graph_piecewise_compilation or\ + use_inductor_piecewise_compilation, \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ f"cudagraph_mode={self.cudagraph_mode}, "\ From eabb1b62736dc32dcdd56aca4c1800114136d5d7 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 11 Sep 2025 15:08:39 -0700 Subject: [PATCH 13/29] nit Signed-off-by: Boyuan Feng --- vllm/config/compilation.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3da92fee1fc9..bdfa72e06743 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -443,14 +443,6 @@ def __post_init__(self) -> None: if KEY not in self.inductor_compile_config: self.inductor_compile_config[KEY] = False - if self.use_inductor_graph_partition and not is_torch_equal_or_newer( - "2.9.0.dev"): - logger.warning_once( - "Inductor graph partition requires pytorch 2.9 which is " - "not available. Falling back to " - "use_inductor_graph_partition=False.") - self.use_inductor_graph_partition = False - for k, v in self.inductor_passes.items(): if not isinstance(v, str): assert callable(v), ( From 04e980198668638b36de3d308fd527d2f26d54ea Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 12 Sep 2025 14:54:35 -0700 Subject: [PATCH 14/29] Update vllm/v1/cudagraph_dispatcher.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Luka Govedič Signed-off-by: Boyuan Feng --- vllm/v1/cudagraph_dispatcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index a8907bb42cb5..cd8b27df8d05 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -55,6 +55,7 @@ def __init__(self, vllm_config: VllmConfig): use_inductor_piecewise_compilation, \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ + "and attention should be in splitting_ops or inductor splitting should be used" \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From 6cf5bd5fc4c612a68e88f89c2446a09ce93d2e0a Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Sun, 14 Sep 2025 11:40:02 -0700 Subject: [PATCH 15/29] add piecewise test Signed-off-by: Boyuan Feng --- tests/compile/piecewise/test_simple.py | 54 +++++++++++++++++++++----- tests/compile/silly_attention.py | 1 + vllm/v1/cudagraph_dispatcher.py | 3 +- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 84f4945c8272..59fbd03b0e19 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -50,16 +50,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@pytest.mark.parametrize("use_inductor", [True, False]) -@torch.inference_mode() -def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 - +def _run_simple_model( + splitting_ops, + use_inductor_graph_partition, + use_inductor, + expected_num_piecewise_graphs_seen, + expected_num_piecewise_capturable_graphs_seen, + expected_num_backend_compilations, +): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, use_inductor=use_inductor, - splitting_ops=["silly.attention"], + splitting_ops=splitting_ops, + use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], )) @@ -70,9 +74,10 @@ def test_simple_piecewise_compile(use_inductor): with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen= + expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured= 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ), set_forward_context(None, @@ -104,3 +109,34 @@ def test_simple_piecewise_compile(use_inductor): output = model(input) assert get_global_counter() == 2 assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) + + +@pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() +def test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=["silly.attention"], + use_inductor_graph_partition=False, + use_inductor=use_inductor, + expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + expected_num_backend_compilations= + 3, # num_piecewise_capturable_graphs_seen + ) + + +@torch.inference_mode() +def test_simple_inductor_graph_partition(): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=[], + use_inductor_graph_partition=True, + use_inductor=True, + expected_num_piecewise_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_piecewise_capturable_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_backend_compilations= + 1, # since not splitting at fx graph level + ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 13eb0bf4b1fa..baedafbae99f 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -60,4 +60,5 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index cd8b27df8d05..b410cd5f42b1 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -55,7 +55,8 @@ def __init__(self, vllm_config: VllmConfig): use_inductor_piecewise_compilation, \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ - "and attention should be in splitting_ops or inductor splitting should be used" \ + "and attention should be in splitting_ops or inductor "\ + " splitting should be used. " \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From 70f45dab8db701030c32ca2d765a8be4a0d09a97 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Sun, 14 Sep 2025 14:12:21 -0700 Subject: [PATCH 16/29] lint Signed-off-by: Boyuan Feng --- vllm/v1/cudagraph_dispatcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index b410cd5f42b1..52d19bbad001 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -55,8 +55,8 @@ def __init__(self, vllm_config: VllmConfig): use_inductor_piecewise_compilation, \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ - "and attention should be in splitting_ops or inductor "\ - " splitting should be used. " \ + "and attention should be in splitting_ops or "\ + "inductor splitting should be used. " \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From 4cce30cb47531ddf4c24c97f41ce35eb92b98466 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Mon, 15 Sep 2025 15:42:43 -0700 Subject: [PATCH 17/29] add custom compile config test Signed-off-by: Boyuan Feng --- tests/compile/test_full_graph.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 84178344a5f3..f76dc07b1473 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -11,8 +11,10 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel, PassConfig +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + PassConfig) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from ..utils import create_new_process_for_each_test @@ -107,6 +109,19 @@ def test_full_graph( (CompilationConfig(level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()), ("facebook/opt-125m", {})), + ] + [ + # graph inductor partition + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + # inductor graph partition uses + # torch._C.Tag.cudagraph_unsafe to specify splitting ops + splitting_ops=[], + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2]), + model) for model in models_list(all=False) + if is_torch_equal_or_newer("2.9.0.dev") ]) # only test some of the models @create_new_process_for_each_test() From d3809fb11ed89c03140b6d418da106169195d952 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 16 Sep 2025 11:33:38 -0700 Subject: [PATCH 18/29] more tests for splitting_ops Signed-off-by: Boyuan Feng --- tests/compile/piecewise/test_simple.py | 7 +++++-- tests/compile/test_full_graph.py | 1 - vllm/config/compilation.py | 23 +++++++++++++++-------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 59fbd03b0e19..8cbc2162348d 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -127,10 +127,13 @@ def test_simple_piecewise_compile(use_inductor): @torch.inference_mode() -def test_simple_inductor_graph_partition(): +@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) +def test_simple_inductor_graph_partition(splitting_ops): assert VLLM_USE_V1 _run_simple_model( - splitting_ops=[], + # inductor graph partition automatically resets splitting_ops + # to be an empty list + splitting_ops=splitting_ops, use_inductor_graph_partition=True, use_inductor=True, expected_num_piecewise_graphs_seen= diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index f76dc07b1473..db12a29cbf45 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -116,7 +116,6 @@ def test_full_graph( level=CompilationLevel.PIECEWISE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops - splitting_ops=[], use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, compile_sizes=[1, 2]), diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index bdfa72e06743..1fb16ac05138 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -567,14 +567,21 @@ def set_splitting_ops_for_v1(self): "level is CompilationLevel.PIECEWISE") if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture the - # full cudagraph outside the fx graph. This reduces some cpu - # overhead when the runtime batch_size is not cudagraph captured. - # see https://github.com/vllm-project/vllm/pull/20059 for details. - # make a copy to avoid mutating the class-level list via reference. - self.splitting_ops = list(self._attention_ops) + if self.use_inductor_graph_partition: + # When using inductor graph partition, we set splitting_ops + # to be empty and rely on torch._C.Tag.cudagraph_unsafe to + # annotate custom ops as splitting ops. + self.splitting_ops = [] + else: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. make a copy to avoid mutating the class-level + # list via reference. + self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: logger.warning_once( "Using piecewise compilation with empty " From d7a73db22cf93e2a9e1c68a817c0156d64a00cd7 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 16 Sep 2025 22:41:33 -0700 Subject: [PATCH 19/29] add tests for attention_quant_pattern Signed-off-by: Boyuan Feng --- tests/compile/test_fusion_attn.py | 41 +++++++++++++++++++++++-------- vllm/config/compilation.py | 13 ++++++---- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 6baf4bf83f49..42d34322049e 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +import logging from typing import Optional import pytest @@ -339,6 +340,10 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) +# TODO(boyuan): test inductor graph partition on rocm +@pytest.mark.parametrize( + "use_inductor_graph_partition", + [False] if current_platform.is_rocm() else [False, True]) @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") @@ -352,7 +357,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], backend: _Backend, split_attention: bool, - monkeypatch, dist_init): + use_inductor_graph_partition: bool, + monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" monkeypatch.setenv("VLLM_USE_V1", "1") @@ -372,6 +378,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, custom_ops=["+quant_fp8"], + use_inductor_graph_partition=use_inductor_graph_partition, ), cache_config=CacheConfig(cache_dtype="fp8")) @@ -407,9 +414,13 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, vllm_config=vllm_config_unfused) model_unfused = model_unfused.to(device) - forward_ctx = get_forward_context() - forward_ctx.attn_metadata = model_unfused.build_attn_metadata( - batch_size, use_hnd=split_attention) + # TODO(boyuan): the attn_metadata with quantization does not + # work on my server. So skip for inductor graph partition + # test for now. + if not use_inductor_graph_partition: + forward_ctx = get_forward_context() + forward_ctx.attn_metadata = model_unfused.build_attn_metadata( + batch_size, use_hnd=split_attention) # Run model directly without compilation and fusion result_unfused = model_unfused(q, k, v) @@ -429,9 +440,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, w=model_unfused.w) model_fused = model_fused.to(device) - forward_ctx = get_forward_context() - forward_ctx.attn_metadata = model_fused.build_attn_metadata( - batch_size, use_hnd=split_attention) + # TODO(boyuan) + if not use_inductor_graph_partition: + forward_ctx = get_forward_context() + forward_ctx.attn_metadata = model_fused.build_attn_metadata( + batch_size, use_hnd=split_attention) # Create test backend with fusion passes enabled noop_pass = NoOpEliminationPass(vllm_config) @@ -444,16 +457,24 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, backend=test_backend, fullgraph=True) assert model_compiled.attn._o_scale_float is None - result_fused_1 = model_compiled(q, k, v) + + with caplog_vllm.at_level(logging.DEBUG): + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: # With the Flashinfer backend after the 1st round of the forward # pass, output quant scale should be loaded into the attn layer's # _o_scale_float, the 2nd round should reuse the loaded # _o_scale_float - assert model_compiled.attn._o_scale_float is not None + if use_inductor_graph_partition: + assert ("Fused quantization onto 1 attention nodes" + in caplog_vllm.text) + else: + assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) - assert model_compiled.attn._o_scale_float is not None + + if not use_inductor_graph_partition: + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, result_fused_2, diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 1fb16ac05138..38e9e5257cd6 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -566,11 +566,18 @@ def set_splitting_ops_for_v1(self): "set_splitting_ops_for_v1 should only be called when " "level is CompilationLevel.PIECEWISE") + use_inductor_graph_partition_msg = ( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + if self.splitting_ops is None: if self.use_inductor_graph_partition: # When using inductor graph partition, we set splitting_ops # to be empty and rely on torch._C.Tag.cudagraph_unsafe to # annotate custom ops as splitting ops. + logger.warning_once(use_inductor_graph_partition_msg) self.splitting_ops = [] else: # NOTE: When using full cudagraph, instead of setting an empty @@ -599,11 +606,7 @@ def set_splitting_ops_for_v1(self): self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] elif self.use_inductor_graph_partition: - logger.warning_once( - "When use_inductor_graph_partition=True, splitting_ops " - "are ignored and set to an empty list. Instead, " - "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " - "used to annotate custom ops for graph partition.") + logger.warning_once(use_inductor_graph_partition_msg) self.splitting_ops = [] if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": From 289a60e927eefa46e4c3ca5d7eb52f6c80f5480e Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 16 Sep 2025 22:55:46 -0700 Subject: [PATCH 20/29] rearch is_attention_compiled_piecewise Signed-off-by: Boyuan Feng --- vllm/config/compilation.py | 12 ++++++++++++ vllm/v1/cudagraph_dispatcher.py | 13 ++----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 38e9e5257cd6..e1d5c448d750 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -624,3 +624,15 @@ def set_splitting_ops_for_v1(self): def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) + + def is_attention_compiled_piecewise(self) -> bool: + use_fx_graph_piecewise_compilation = ( + self.level == CompilationLevel.PIECEWISE + and self.splitting_ops_contain_attention()) + + use_inductor_piecewise_compilation = ( + self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) + + return use_fx_graph_piecewise_compilation or \ + use_inductor_piecewise_compilation diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 52d19bbad001..ea4fba8eeea6 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor from vllm.logger import init_logger @@ -42,17 +42,8 @@ def __init__(self, vllm_config: VllmConfig): not_use_piecewise_compilation = ( not self.cudagraph_mode.requires_piecewise_compilation()) - use_fx_graph_piecewise_compilation = ( - self.compilation_config.level == CompilationLevel.PIECEWISE - and self.compilation_config.splitting_ops_contain_attention()) - - use_inductor_piecewise_compilation = ( - self.compilation_config.use_inductor_graph_partition - and not self.compilation_config.splitting_ops_contain_attention()) - assert not_use_piecewise_compilation or \ - use_fx_graph_piecewise_compilation or\ - use_inductor_piecewise_compilation, \ + self.compilation_config.is_attention_compiled_piecewise(), \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ "and attention should be in splitting_ops or "\ From b5972fa47a54750044cf2e6ee34d1e1bf3b8c8cb Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 17 Sep 2025 22:00:48 -0700 Subject: [PATCH 21/29] move set/unset wrapper to support_torch_compile for frame-specific Signed-off-by: Boyuan Feng --- vllm/compilation/backends.py | 33 ------------------------ vllm/compilation/decorators.py | 47 ++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 6ce8f9c42c17..28f1bc1552ab 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -327,39 +327,6 @@ def call_module(self, target: torch.fx.node.Target, ] global compilation_start_time - if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and self.compilation_config.use_inductor_graph_partition): - # If we're using Inductor-based graph partitioning, we currently - # have the whole `fx.Graph` before Inductor lowering and - # and the piecewise splitting happens after all graph - # passes and fusions. Here, we add a custom hook for Inductor - # to wrap each partition with our static graph wrapper class to - # maintain more control over static graph capture and replay. - - from torch._inductor.utils import CUDAGraphWrapperMetadata - - from .cuda_graph import CUDAGraphOptions - - static_graph_wrapper_class = resolve_obj_by_qualname( - current_platform.get_static_graph_wrapper_cls()) - - def customized_cudagraph_wrapper( - f, metadata: CUDAGraphWrapperMetadata): - partition_id = metadata.partition_index - num_partitions = metadata.num_partitions - return static_graph_wrapper_class( - runnable=f, - vllm_config=self.vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions( - debug_log_enable=partition_id == 0, - gc_disable=partition_id != 0, - weak_ref_output=partition_id == num_partitions - 1, - )) - - torch._inductor.utils.set_customized_partition_wrappers( - customized_cudagraph_wrapper) - compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 41d9fcb824b0..1e788e936f7e 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -11,10 +11,11 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, VllmConfig +from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -302,7 +303,49 @@ def patched_inline_call(parent, func, args, kwargs): with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( **dynamo_config_patches): + compilation_config = self.vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + # If we're using Inductor-based graph partitioning, we + # currently have the whole `fx.Graph` before Inductor + # lowering and and the piecewise splitting happens after + # all graph passes and fusions. Here, we add a custom hook + # for Inductor to wrap each partition with our static + # graph wrapper class to maintain more control over + # static graph capture and replay. + + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from .cuda_graph import CUDAGraphOptions + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper( + f, metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions + - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + output = self.compiled_callable(*args, **kwargs) + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers( + None) + return output # usually, capturing the model once is enough, and then we can From 7570f4b72f828831b95e08de84b8d1fec34caad0 Mon Sep 17 00:00:00 2001 From: boyuanfeng Date: Thu, 18 Sep 2025 05:51:45 +0000 Subject: [PATCH 22/29] update test_attention_quant_pattern Signed-off-by: boyuanfeng --- tests/compile/test_fusion_attn.py | 25 +++++++++---------------- vllm/v1/worker/gpu_model_runner.py | 22 +++++++--------------- 2 files changed, 16 insertions(+), 31 deletions(-) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 42d34322049e..567b46dfd108 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -414,13 +414,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, vllm_config=vllm_config_unfused) model_unfused = model_unfused.to(device) - # TODO(boyuan): the attn_metadata with quantization does not - # work on my server. So skip for inductor graph partition - # test for now. - if not use_inductor_graph_partition: - forward_ctx = get_forward_context() - forward_ctx.attn_metadata = model_unfused.build_attn_metadata( - batch_size, use_hnd=split_attention) + forward_ctx = get_forward_context() + forward_ctx.attn_metadata = model_unfused.build_attn_metadata( + batch_size, use_hnd=split_attention) # Run model directly without compilation and fusion result_unfused = model_unfused(q, k, v) @@ -440,11 +436,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, w=model_unfused.w) model_fused = model_fused.to(device) - # TODO(boyuan) - if not use_inductor_graph_partition: - forward_ctx = get_forward_context() - forward_ctx.attn_metadata = model_fused.build_attn_metadata( - batch_size, use_hnd=split_attention) + forward_ctx = get_forward_context() + forward_ctx.attn_metadata = model_fused.build_attn_metadata( + batch_size, use_hnd=split_attention) # Create test backend with fusion passes enabled noop_pass = NoOpEliminationPass(vllm_config) @@ -469,12 +463,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, if use_inductor_graph_partition: assert ("Fused quantization onto 1 attention nodes" in caplog_vllm.text) - else: - assert model_compiled.attn._o_scale_float is not None + + assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) - if not use_inductor_graph_partition: - assert model_compiled.attn._o_scale_float is not None + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, result_fused_2, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 823f242f5bad..f256dc160a6b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1863,21 +1863,13 @@ def _preprocess( self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) - # Inductor graph partition attempts to wrap all inductor-generated - # functions with CUDAGraph wrapper. Set CUDAGraphMode.None to - # avoid that for computing input embeddings. - with set_forward_context( - None, - self.vllm_config, - cudagraph_runtime_mode=CUDAGraphMode.NONE, - ): - # NOTE(woosuk): To unify token ids and soft tokens (vision - # embeddings), we always use embeddings (rather than token ids) - # as input to the multimodal model, even when the input is text. - inputs_embeds_scheduled = self.model.get_input_embeddings( - input_ids=self.input_ids.gpu[:num_scheduled_tokens], - multimodal_embeddings=mm_embeds or None, - ) + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None, + ) # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( From c7ff7c42942f2127dc5985a6cbd8b674f43cbeb5 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 18 Sep 2025 13:54:38 -0700 Subject: [PATCH 23/29] Update vllm/config/compilation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Luka Govedič Signed-off-by: Boyuan Feng --- vllm/config/compilation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index e1d5c448d750..f13391482c29 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -630,9 +630,11 @@ def is_attention_compiled_piecewise(self) -> bool: self.level == CompilationLevel.PIECEWISE and self.splitting_ops_contain_attention()) + inductor_used = (self.level == CompilationLevel.PIECEWISE and self.use_inductor) or (self.level >= CompilationLevel.DYNAMO_AS_IS and self.backend == "inductor") use_inductor_piecewise_compilation = ( - self.use_inductor_graph_partition - and not self.splitting_ops_contain_attention()) + inductor_used and + self.use_inductor_graph_partition and + not self.splitting_ops_contain_attention()) return use_fx_graph_piecewise_compilation or \ use_inductor_piecewise_compilation From 4a38b3695108eb8a893ce0daf956d35dbc6e4e98 Mon Sep 17 00:00:00 2001 From: boyuanfeng Date: Thu, 18 Sep 2025 22:17:46 +0000 Subject: [PATCH 24/29] more tests Signed-off-by: boyuanfeng --- tests/compile/piecewise/test_simple.py | 9 +++++-- tests/compile/test_full_graph.py | 34 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 8cbc2162348d..5e85a232a34a 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -57,6 +57,7 @@ def _run_simple_model( expected_num_piecewise_graphs_seen, expected_num_piecewise_capturable_graphs_seen, expected_num_backend_compilations, + expected_num_cudagraph_captured, ): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, @@ -78,8 +79,7 @@ def _run_simple_model( num_piecewise_capturable_graphs_seen= expected_num_piecewise_capturable_graphs_seen, num_backend_compilations=expected_num_backend_compilations, - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_cudagraph_captured=expected_num_cudagraph_captured, ), set_forward_context(None, vllm_config=vllm_config): # background context # warm up with background context @@ -123,6 +123,8 @@ def test_simple_piecewise_compile(use_inductor): expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers expected_num_backend_compilations= 3, # num_piecewise_capturable_graphs_seen + expected_num_cudagraph_captured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ) @@ -142,4 +144,7 @@ def test_simple_inductor_graph_partition(splitting_ops): 1, # since not splitting at fx graph level expected_num_backend_compilations= 1, # since not splitting at fx graph level + expected_num_cudagraph_captured= + 6, # inductor graph partition still captures 6 + # graph, same as fx graph partition. ) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index db12a29cbf45..e0ec8a2a6d75 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -3,6 +3,7 @@ from __future__ import annotations +import logging import tempfile from typing import Any, Optional, Union @@ -10,7 +11,9 @@ import torch from tests.quantization.utils import is_quant_method_supported +from tests.v1.attention.utils import _Backend from vllm import LLM, SamplingParams +from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) from vllm.platforms import current_platform @@ -133,6 +136,37 @@ def test_custom_compile_config( run_model(compilation_config, model, model_kwargs) +@pytest.mark.parametrize("model", [ + "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", + "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", +]) +def test_inductor_graph_partition_attn_fusion(model, caplog_vllm): + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + print(f"MODEL={model}") + + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2], + custom_ops=["+quant_fp8"], + pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + ) + model_kwargs = { + "kv_cache_dtype": "fp8", + "max_model_len": 1024, + } + with caplog_vllm.at_level( + logging.DEBUG), global_force_attn_backend_context_manager( + _Backend.FLASHINFER): + run_model(compilation_config, model, model_kwargs) + + assert ("Fused quantization onto 1 attention nodes" in caplog_vllm.text) + + def run_model(compile_config: Union[int, CompilationConfig], model: str, model_kwargs: dict[str, Any]): prompts = [ From d4269d992634ba9d091f350d6d283c879204c47f Mon Sep 17 00:00:00 2001 From: boyuanfeng Date: Thu, 18 Sep 2025 22:37:43 +0000 Subject: [PATCH 25/29] move wrapper set/unset to context manager Signed-off-by: boyuanfeng --- vllm/compilation/decorators.py | 50 ++++------------------------------ vllm/config/compilation.py | 10 ++++--- vllm/utils/__init__.py | 50 +++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 50 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 1e788e936f7e..c50b95eb9b66 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -11,11 +11,10 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import resolve_obj_by_qualname, supports_dynamo +from vllm.utils import maybe_use_cudagraph_partition_wrapper, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -302,50 +301,11 @@ def patched_inline_call(parent, func, args, kwargs): with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches): - compilation_config = self.vllm_config.compilation_config - if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and compilation_config.use_inductor_graph_partition): - # If we're using Inductor-based graph partitioning, we - # currently have the whole `fx.Graph` before Inductor - # lowering and and the piecewise splitting happens after - # all graph passes and fusions. Here, we add a custom hook - # for Inductor to wrap each partition with our static - # graph wrapper class to maintain more control over - # static graph capture and replay. - - from torch._inductor.utils import CUDAGraphWrapperMetadata - - from .cuda_graph import CUDAGraphOptions - - static_graph_wrapper_class = resolve_obj_by_qualname( - current_platform.get_static_graph_wrapper_cls()) - - def customized_cudagraph_wrapper( - f, metadata: CUDAGraphWrapperMetadata): - partition_id = metadata.partition_index - num_partitions = metadata.num_partitions - return static_graph_wrapper_class( - runnable=f, - vllm_config=self.vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions( - debug_log_enable=partition_id == 0, - gc_disable=partition_id != 0, - weak_ref_output=partition_id == num_partitions - - 1, - )) - - torch._inductor.utils.set_customized_partition_wrappers( - customized_cudagraph_wrapper) - + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config): output = self.compiled_callable(*args, **kwargs) - if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and compilation_config.use_inductor_graph_partition): - torch._inductor.utils.set_customized_partition_wrappers( - None) - return output # usually, capturing the model once is enough, and then we can diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f13391482c29..69a923c3cbdb 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -630,11 +630,13 @@ def is_attention_compiled_piecewise(self) -> bool: self.level == CompilationLevel.PIECEWISE and self.splitting_ops_contain_attention()) - inductor_used = (self.level == CompilationLevel.PIECEWISE and self.use_inductor) or (self.level >= CompilationLevel.DYNAMO_AS_IS and self.backend == "inductor") + inductor_used = (self.level == CompilationLevel.PIECEWISE + and self.use_inductor) or ( + self.level >= CompilationLevel.DYNAMO_AS_IS + and self.backend == "inductor") use_inductor_piecewise_compilation = ( - inductor_used and - self.use_inductor_graph_partition and - not self.splitting_ops_contain_attention()) + inductor_used and self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) return use_fx_graph_piecewise_compilation or \ use_inductor_piecewise_compilation diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index f13381ecd9ff..6c8786de0ff6 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -77,7 +77,7 @@ if TYPE_CHECKING: from argparse import Namespace - from vllm.config import ModelConfig, VllmConfig + from vllm.config import CUDAGraphMode, ModelConfig, VllmConfig from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -3443,3 +3443,51 @@ def decorate_logs(process_name: Optional[str] = None) -> None: pid = os.getpid() _add_prefix(sys.stdout, process_name, pid) _add_prefix(sys.stderr, process_name, pid) + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering and and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.platforms import current_platform + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from .cuda_graph import CUDAGraphOptions + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) From 20b9ef1a07d9b281dd0c225fd41cc1624ad9f3fb Mon Sep 17 00:00:00 2001 From: boyuanfeng Date: Thu, 18 Sep 2025 22:54:30 +0000 Subject: [PATCH 26/29] nit Signed-off-by: boyuanfeng --- vllm/utils/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 6c8786de0ff6..ed3d04c46b1a 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -77,7 +77,7 @@ if TYPE_CHECKING: from argparse import Namespace - from vllm.config import CUDAGraphMode, ModelConfig, VllmConfig + from vllm.config import ModelConfig, VllmConfig from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -3457,14 +3457,15 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): graph wrapper class to maintain more control over static graph capture and replay. """ - from vllm.platforms import current_platform + from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE and compilation_config.use_inductor_graph_partition): from torch._inductor.utils import CUDAGraphWrapperMetadata - from .cuda_graph import CUDAGraphOptions + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform static_graph_wrapper_class = resolve_obj_by_qualname( current_platform.get_static_graph_wrapper_cls()) From e055458c77fcdc13095beaae63756a0e3f9ff593 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 18 Sep 2025 22:29:06 -0700 Subject: [PATCH 27/29] update test Signed-off-by: Boyuan Feng --- tests/compile/test_full_graph.py | 22 +++++++++++++--------- tests/compile/test_fusion_attn.py | 8 +------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index e0ec8a2a6d75..0c77a90dc348 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -136,22 +136,16 @@ def test_custom_compile_config( run_model(compilation_config, model, model_kwargs) -@pytest.mark.parametrize("model", [ - "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", - "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", -]) -def test_inductor_graph_partition_attn_fusion(model, caplog_vllm): +def test_inductor_graph_partition_attn_fusion(caplog_vllm): if not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available " "in PyTorch 2.9+") - print(f"MODEL={model}") - + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, - compile_sizes=[1, 2], custom_ops=["+quant_fp8"], pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), ) @@ -164,7 +158,17 @@ def test_inductor_graph_partition_attn_fusion(model, caplog_vllm): _Backend.FLASHINFER): run_model(compilation_config, model, model_kwargs) - assert ("Fused quantization onto 1 attention nodes" in caplog_vllm.text) + try: + assert ("Fused quantization onto 48 attention nodes" + in caplog_vllm.text), caplog_vllm.text + except AssertionError: + # Note: this message is only triggered when the compilation goes + # through the custom pass. Due to multiple layers of cache on + # PyTorch side, the compilation of a graph may be cached such + # that custom pass directly goes through cache. In this case, + # we go through this branch and assert that the pass is not + # triggered. + assert "Fused quantization" not in caplog_vllm.text def run_model(compile_config: Union[int, CompilationConfig], model: str, diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 567b46dfd108..68db10917260 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -import logging from typing import Optional import pytest @@ -452,18 +451,13 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, fullgraph=True) assert model_compiled.attn._o_scale_float is None - with caplog_vllm.at_level(logging.DEBUG): - result_fused_1 = model_compiled(q, k, v) + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: # With the Flashinfer backend after the 1st round of the forward # pass, output quant scale should be loaded into the attn layer's # _o_scale_float, the 2nd round should reuse the loaded # _o_scale_float - if use_inductor_graph_partition: - assert ("Fused quantization onto 1 attention nodes" - in caplog_vllm.text) - assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) From 91c03a416d6b54271d9323f0f3196ad6224ca3fb Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 19 Sep 2025 11:28:36 -0700 Subject: [PATCH 28/29] move maybe_use_cudagraph_partition_wrapper to decorators.py Signed-off-by: Boyuan Feng --- vllm/compilation/decorators.py | 52 +++++++++++++++++++++++++++++++++- vllm/utils/__init__.py | 49 -------------------------------- 2 files changed, 51 insertions(+), 50 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index c50b95eb9b66..b7a6e23c1aa7 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import inspect from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch @@ -14,7 +15,7 @@ from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.utils import maybe_use_cudagraph_partition_wrapper, supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -317,3 +318,52 @@ def patched_inline_call(parent, func, args, kwargs): cls.__call__ = __call__ return cls + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering and and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index c1306d7e84bb..d4013a69e99f 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3445,55 +3445,6 @@ def decorate_logs(process_name: Optional[str] = None) -> None: _add_prefix(sys.stderr, process_name, pid) -@contextlib.contextmanager -def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): - """ - Context manager to set/unset customized cudagraph partition wrappers. - - If we're using Inductor-based graph partitioning, we currently have the - whole `fx.Graph` before Inductor lowering and and the piecewise - splitting happens after all graph passes and fusions. Here, we add - a custom hook for Inductor to wrap each partition with our static - graph wrapper class to maintain more control over static graph - capture and replay. - """ - from vllm.config import CUDAGraphMode - - compilation_config = vllm_config.compilation_config - if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and compilation_config.use_inductor_graph_partition): - from torch._inductor.utils import CUDAGraphWrapperMetadata - - from vllm.compilation.cuda_graph import CUDAGraphOptions - from vllm.platforms import current_platform - - static_graph_wrapper_class = resolve_obj_by_qualname( - current_platform.get_static_graph_wrapper_cls()) - - def customized_cudagraph_wrapper(f, - metadata: CUDAGraphWrapperMetadata): - partition_id = metadata.partition_index - num_partitions = metadata.num_partitions - return static_graph_wrapper_class( - runnable=f, - vllm_config=vllm_config, - runtime_mode=CUDAGraphMode.PIECEWISE, - cudagraph_options=CUDAGraphOptions( - debug_log_enable=partition_id == 0, - gc_disable=partition_id != 0, - weak_ref_output=partition_id == num_partitions - 1, - )) - - torch._inductor.utils.set_customized_partition_wrappers( - customized_cudagraph_wrapper) - - yield - - if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and compilation_config.use_inductor_graph_partition): - torch._inductor.utils.set_customized_partition_wrappers(None) - - def length_from_prompt_token_ids_or_embeds( prompt_token_ids: Optional[list[int]], prompt_embeds: Optional[torch.Tensor], From 19787d3b4c05f49a8c53f546ef60af0f6c968354 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 19 Sep 2025 15:35:26 -0700 Subject: [PATCH 29/29] test inductor graph partition only when >= torch2.9 Signed-off-by: Boyuan Feng --- tests/compile/piecewise/test_simple.py | 5 +++++ tests/compile/test_full_graph.py | 5 +++++ tests/compile/test_fusion_attn.py | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 5e85a232a34a..41055f431569 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -15,6 +15,7 @@ VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -132,6 +133,10 @@ def test_simple_piecewise_compile(use_inductor): @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) def test_simple_inductor_graph_partition(splitting_ops): assert VLLM_USE_V1 + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + _run_simple_model( # inductor graph partition automatically resets splitting_ops # to be an empty list diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 0c77a90dc348..053236af2725 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -131,6 +131,11 @@ def test_custom_compile_config( compilation_config: CompilationConfig, model_info: tuple[str, dict[str, Any]], ): + if (compilation_config.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + model, model_kwargs = model_info print(f"MODEL={model}") run_model(compilation_config, model, model_kwargs) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 68db10917260..022f183b3193 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() @@ -360,6 +361,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" + if use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + monkeypatch.setenv("VLLM_USE_V1", "1") if split_attention: monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")