From d675f9d19645c2167352b4c3be57cd8160b13509 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 17 Oct 2025 19:45:05 -0700
Subject: [PATCH 1/3] fix graph partition signature

Signed-off-by: Boyuan Feng
---
 tests/compile/test_fusions_e2e.py |  11 +--
 vllm/env_override.py              | 150 ++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
index 7399abaec542..24f9a874195b 100644
--- a/tests/compile/test_fusions_e2e.py
+++ b/tests/compile/test_fusions_e2e.py
@@ -116,7 +116,7 @@ def test_attn_quant(
     allreduce_fusions: int,
     custom_ops: str,
     inductor_graph_partition: bool,
-    caplog_mp_spawn,
+    caplog_vllm,
     monkeypatch,
 ):
     if backend == _Backend.FLASHINFER and (
@@ -157,14 +157,11 @@ def test_attn_quant(
         inductor_compile_config={"force_disable_caches": True},
     )
 
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+    with caplog_vllm.at_level(logging.DEBUG):
         run_model(compilation_config, model_name, **model_kwargs)
-    matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    assert len(matches) == 1, log_holder.text
+    matches = re.findall(r"Fused quant onto (\d+) attention nodes", caplog_vllm.text)
+    assert len(matches) == 1, caplog_vllm.text
     assert int(matches[0]) == attention_fusions
 
 
diff --git a/vllm/env_override.py b/vllm/env_override.py
index f4ac48584cb7..b20b901ff24d 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -90,6 +90,155 @@ def get_output_names(graph_outputs) -> list[str]:
     assert len(planning_states) == 0
 
 
+# ===================================================
+# torch 2.9 Inductor get_graph_partition_signature monkeypatch
+# ===================================================
+# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
+# a test failure for `tests/compile/test_fusions_e2e.py -sv -k test_attn_quant`.
+# For more context, see https://github.com/pytorch/pytorch/pull/165815.
+
+
+def get_graph_partition_signature_patched(
+    self, partitions, skip_cudagraphs: list[bool]
+):
+    """
+    Gets the signature for each graph partition, including input nodes, output
+    nodes, and whether each input is deallocated within the partition.
+    """
+    from torch._inductor import dependencies
+    from torch._inductor.ir import GraphPartitionSignature, MutationOutput, NoneLayout
+    from torch._inductor.virtualized import V
+    from torch.utils._ordered_set import OrderedSet
+
+    signatures = []
+
+    unmet_output_names = OrderedSet(V.graph.get_output_names())
+    name_to_node = self.get_name_to_nodes()
+
+    def is_none_layout(buf_name: str) -> bool:
+        """
+        Checks if buf_name is NoneLayout. Buffers with NoneLayout are not allocated,
+        so a graph partition should not take them as inputs or outputs.
+        """
+        buf = self.name_to_buf.get(buf_name, None)
+
+        if buf is None:
+            return False
+
+        if isinstance(buf.node.layout, NoneLayout):
+            if isinstance(buf.node, MutationOutput) and (
+                real_name := self.mutation_real_name.get(buf_name, None)
+            ):
+                return is_none_layout(real_name)
+
+            return True
+
+        return False
+
+    for partition, skip_cudagraph in zip(
+        reversed(partitions), reversed(skip_cudagraphs)
+    ):
+        output_names: OrderedSet[str] = OrderedSet()
+
+        for node in partition:
+            output_names.update(node.outputs_by_name.keys())
+
+        returned_output_names = output_names.intersection(unmet_output_names)
+
+        # all reads/writes are partition inputs except those generated
+        # within the partition and tensor constants
+        read_writes = dependencies.ReadWrites.merge_list(
+            [node.read_writes for node in partition]
+        )
+
+        # WeakDep is a fake dependency on an unused buffer. It should not
+        # appear in partition_input_names for inputs that are actually read or written.
+        partition_input_names = (
+            OrderedSet(
+                [
+                    x.name
+                    for x in read_writes.reads | read_writes.writes
+                    if not is_none_layout(x.name)
+                ]
+            )
+            - output_names
+        )
+
+        partition_input_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in partition_input_names
+        )
+
+        buffer_names_to_free: OrderedSet[str] = OrderedSet()
+        for node in partition:
+            buffer_names_to_free.update(node.last_usage)
+
+        # buffer_names_to_free may contain buffers allocated in previous
+        # graph partitions. These buffers should also be partition
+        # inputs.
+        extra_input_names = [
+            name
+            for name in (buffer_names_to_free - output_names)
+            if name in name_to_node
+        ]
+        partition_input_names.update(extra_input_names)
+
+        input_nodes = {
+            name: name_to_node[name]
+            for name in partition_input_names
+            if name in name_to_node
+        }
+        input_deallocation = {
+            name: name in buffer_names_to_free
+            for name in partition_input_names
+            if name in name_to_node
+        }
+
+        # if an input tensor is not freed in the partition function, it should
+        # also be returned as an output. This benefits cudagraph, since the
+        # returned output tensor is a cudagraph-managed tensor with a static
+        # tensor address.
+        extra_output_names = [
+            name
+            for name in partition_input_names
+            if name in name_to_node and name not in buffer_names_to_free
+        ]
+
+        returned_output_names.update(extra_output_names)
+
+        returned_output_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in returned_output_names
+        )
+
+        output_nodes = [
+            name_to_node[name]
+            for name in returned_output_names
+            if not is_none_layout(name)
+        ]
+
+        constant_names = [
+            name for name in partition_input_names if name in V.graph.constants
+        ]
+
+        symbol_inputs = self.get_graph_partition_symbol_inputs(partition, input_nodes)
+
+        partition_signature = GraphPartitionSignature(
+            symbol_inputs,
+            input_nodes,
+            output_nodes,
+            input_deallocation,
+            skip_cudagraph,
+            constant_names,
+        )
+
+        signatures.append(partition_signature)
+
+        unmet_output_names = partition_input_names.union(
+            unmet_output_names - returned_output_names
+        )
+
+    return signatures[::-1]
+
+
 # ========================================
 # torch 2.9 Inductor Scheduler monkeypatch
 # ========================================
@@ -196,6 +345,7 @@ def _update_scheduler_patched(self) -> None:
     from torch._inductor.scheduler import Scheduler
 
     Scheduler.should_partition = should_partition_patched
+    Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched
 
     with config.patch("triton.store_cubin", False):
         self.scheduler = Scheduler(self.operations)

From 01bf7afa92c94bb5d21aad7f2b2e7b15b5b548db Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 17 Oct 2025 19:57:12 -0700
Subject: [PATCH 2/3] nit

Signed-off-by: Boyuan Feng
---
 vllm/env_override.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index b20b901ff24d..8c51614af6f0 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -93,8 +93,9 @@ def get_output_names(graph_outputs) -> list[str]:
 # ===================================================
 # torch 2.9 Inductor get_graph_partition_signature monkeypatch
 # ===================================================
-# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
-# a test failure for `tests/compile/test_fusions_e2e.py -sv -k test_attn_quant`.
+# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
+# work around a test failure for
+# `tests/compile/test_fusions_e2e.py::test_attn_quant`.
 # For more context, see https://github.com/pytorch/pytorch/pull/165815.
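Note on the install mechanism: vllm/env_override.py swaps the patched function
onto Inductor's Scheduler class, as the `_update_scheduler_patched` hunk in
patch 1 shows. Below is a minimal, self-contained sketch of that pattern; the
torch-version guard and the pass-through body are illustrative assumptions,
not vLLM code.

    import torch

    if torch.__version__.startswith("2.9"):
        from torch._inductor.scheduler import Scheduler

        _original = Scheduler.get_graph_partition_signature

        def get_graph_partition_signature_patched(self, partitions, skip_cudagraphs):
            # vLLM substitutes the fixed implementation from patch 1 here;
            # this sketch delegates to the stock method so it stays runnable.
            return _original(self, partitions, skip_cudagraphs)

        # Mirrors the assignment added to _update_scheduler_patched in patch 1.
        Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched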
From cfc0e585fffddfa4ec84ac7e445d83bd8baaeb94 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Sat, 18 Oct 2025 10:14:38 -0700 Subject: [PATCH 3/3] nit Signed-off-by: Boyuan Feng --- tests/compile/test_fusions_e2e.py | 11 +++++++---- vllm/env_override.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py index 24f9a874195b..7399abaec542 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/test_fusions_e2e.py @@ -116,7 +116,7 @@ def test_attn_quant( allreduce_fusions: int, custom_ops: str, inductor_graph_partition: bool, - caplog_vllm, + caplog_mp_spawn, monkeypatch, ): if backend == _Backend.FLASHINFER and ( @@ -157,11 +157,14 @@ def test_attn_quant( inductor_compile_config={"force_disable_caches": True}, ) - with caplog_vllm.at_level(logging.DEBUG): + with caplog_mp_spawn(logging.DEBUG) as log_holder: run_model(compilation_config, model_name, **model_kwargs) - matches = re.findall(r"Fused quant onto (\d+) attention nodes", caplog_vllm.text) - assert len(matches) == 1, caplog_vllm.text + matches = re.findall( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", + log_holder.text, + ) + assert len(matches) == 1, log_holder.text assert int(matches[0]) == attention_fusions diff --git a/vllm/env_override.py b/vllm/env_override.py index 8c51614af6f0..8b6eecd6c842 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -94,7 +94,7 @@ def get_output_names(graph_outputs) -> list[str]: # torch 2.9 Inductor get_graph_partition_signature monkeypatch # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to -# work around a test failure for +# fix inductor partition + attention-nvfp4 quant fusion, tested in # `tests/compile/test_fusions_e2e.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815.
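
For reference, the log assertion that patch 3 restores can be exercised on its
own: caplog_mp_spawn captures log text from the spawned engine process, and
the `fusion_attn.py:\d+]` prefix pins the match to the fusion pass's own log
line, which is why patch 3 reinstates it. A standalone sketch, where the
sample log line is hypothetical:

    import re

    # Hypothetical log line in the shape emitted by the attention-fusion pass.
    log_text = "fusion_attn.py:142] Fused quant onto 32 attention nodes"

    matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_text,
    )
    assert len(matches) == 1, log_text
    assert int(matches[0]) == 32  # the attention_fusions count expected by the test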