From d675f9d19645c2167352b4c3be57cd8160b13509 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 17 Oct 2025 19:45:05 -0700
Subject: [PATCH 1/3] fix graph partition signature

Signed-off-by: Boyuan Feng
---
 tests/compile/test_fusions_e2e.py |  11 +--
 vllm/env_override.py              | 150 ++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
index 7399abaec542..24f9a874195b 100644
--- a/tests/compile/test_fusions_e2e.py
+++ b/tests/compile/test_fusions_e2e.py
@@ -116,7 +116,7 @@ def test_attn_quant(
     allreduce_fusions: int,
     custom_ops: str,
     inductor_graph_partition: bool,
-    caplog_mp_spawn,
+    caplog_vllm,
     monkeypatch,
 ):
     if backend == _Backend.FLASHINFER and (
@@ -157,14 +157,11 @@ def test_attn_quant(
         inductor_compile_config={"force_disable_caches": True},
     )
 
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+    with caplog_vllm.at_level(logging.DEBUG):
         run_model(compilation_config, model_name, **model_kwargs)
-    matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    assert len(matches) == 1, log_holder.text
+    matches = re.findall(r"Fused quant onto (\d+) attention nodes", caplog_vllm.text)
+    assert len(matches) == 1, caplog_vllm.text
     assert int(matches[0]) == attention_fusions
 
 
diff --git a/vllm/env_override.py b/vllm/env_override.py
index f4ac48584cb7..b20b901ff24d 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -90,6 +90,155 @@ def get_output_names(graph_outputs) -> list[str]:
     assert len(planning_states) == 0
 
 
+# ===================================================
+# torch 2.9 Inductor get_graph_partition_signature monkeypatch
+# ===================================================
+# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
+# a test failure for `tests/compile/test_fusions_e2e.py -sv -k test_attn_quant`.
+# For more context, see https://github.com/pytorch/pytorch/pull/165815.
+
+
+def get_graph_partition_signature_patched(
+    self, partitions, skip_cudagraphs: list[bool]
+):
+    """
+    Gets the signature for each graph partition, including input nodes, output
+    nodes, and whether each input is deallocated within the partition.
+    """
+    from torch._inductor import dependencies
+    from torch._inductor.ir import GraphPartitionSignature, MutationOutput, NoneLayout
+    from torch._inductor.virtualized import V
+    from torch.utils._ordered_set import OrderedSet
+
+    signatures = []
+
+    unmet_output_names = OrderedSet(V.graph.get_output_names())
+    name_to_node = self.get_name_to_nodes()
+
+    def is_none_layout(buf_name: str) -> bool:
+        """
+        Checks if buf_name is NoneLayout. Buffers with NoneLayout are not allocated,
+        so a graph partition should not take them as inputs or outputs.
+        """
+        buf = self.name_to_buf.get(buf_name, None)
+
+        if buf is None:
+            return False
+
+        if isinstance(buf.node.layout, NoneLayout):
+            if isinstance(buf.node, MutationOutput) and (
+                real_name := self.mutation_real_name.get(buf_name, None)
+            ):
+                return is_none_layout(real_name)
+
+            return True
+
+        return False
+
+    for partition, skip_cudagraph in zip(
+        reversed(partitions), reversed(skip_cudagraphs)
+    ):
+        output_names: OrderedSet[str] = OrderedSet()
+
+        for node in partition:
+            output_names.update(node.outputs_by_name.keys())
+
+        returned_output_names = output_names.intersection(unmet_output_names)
+
+        # all reads/writes are partition inputs except those generated
+        # within the partition and tensor constants
+        read_writes = dependencies.ReadWrites.merge_list(
+            [node.read_writes for node in partition]
+        )
+
+        # WeakDep is a fake dependency on an unused buffer. It should not
+        # appear in partition_input_names for inputs that are actually read or written.
+        partition_input_names = (
+            OrderedSet(
+                [
+                    x.name
+                    for x in read_writes.reads | read_writes.writes
+                    if not is_none_layout(x.name)
+                ]
+            )
+            - output_names
+        )
+
+        partition_input_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in partition_input_names
+        )
+
+        buffer_names_to_free: OrderedSet[str] = OrderedSet()
+        for node in partition:
+            buffer_names_to_free.update(node.last_usage)
+
+        # buffer_names_to_free may contain buffers allocated in previous
+        # graph partitions. These buffers should also be partition
+        # inputs.
+        extra_input_names = [
+            name
+            for name in (buffer_names_to_free - output_names)
+            if name in name_to_node
+        ]
+        partition_input_names.update(extra_input_names)
+
+        input_nodes = {
+            name: name_to_node[name]
+            for name in partition_input_names
+            if name in name_to_node
+        }
+        input_deallocation = {
+            name: name in buffer_names_to_free
+            for name in partition_input_names
+            if name in name_to_node
+        }
+
+        # if an input tensor is not freed in the partition function, it should
+        # also be returned as an output. This benefits cudagraph, since the
+        # returned output tensor is a cudagraph-managed tensor with a static
+        # tensor address.
+        extra_output_names = [
+            name
+            for name in partition_input_names
+            if name in name_to_node and name not in buffer_names_to_free
+        ]
+
+        returned_output_names.update(extra_output_names)
+
+        returned_output_names = OrderedSet(
+            self.mutation_real_name.get(name, name) for name in returned_output_names
+        )
+
+        output_nodes = [
+            name_to_node[name]
+            for name in returned_output_names
+            if not is_none_layout(name)
+        ]
+
+        constant_names = [
+            name for name in partition_input_names if name in V.graph.constants
+        ]
+
+        symbol_inputs = self.get_graph_partition_symbol_inputs(partition, input_nodes)
+
+        partition_signature = GraphPartitionSignature(
+            symbol_inputs,
+            input_nodes,
+            output_nodes,
+            input_deallocation,
+            skip_cudagraph,
+            constant_names,
+        )
+
+        signatures.append(partition_signature)
+
+        unmet_output_names = partition_input_names.union(
+            unmet_output_names - returned_output_names
+        )
+
+    return signatures[::-1]
+
+
 # ========================================
 # torch 2.9 Inductor Scheduler monkeypatch
 # ========================================
@@ -196,6 +345,7 @@ def _update_scheduler_patched(self) -> None:
     from torch._inductor.scheduler import Scheduler
 
     Scheduler.should_partition = should_partition_patched
+    Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched
 
     with config.patch("triton.store_cubin", False):
         self.scheduler = Scheduler(self.operations)

From 01bf7afa92c94bb5d21aad7f2b2e7b15b5b548db Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Fri, 17 Oct 2025 19:57:12 -0700
Subject: [PATCH 2/3] nit

Signed-off-by: Boyuan Feng
---
 vllm/env_override.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index b20b901ff24d..8c51614af6f0 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -93,8 +93,9 @@ def get_output_names(graph_outputs) -> list[str]:
 # ===================================================
 # torch 2.9 Inductor get_graph_partition_signature monkeypatch
 # ===================================================
-# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
-# a test failure for `tests/compile/test_fusions_e2e.py -sv -k test_attn_quant`.
+# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
+# work around a test failure for
+# `tests/compile/test_fusions_e2e.py::test_attn_quant`.
 # For more context, see https://github.com/pytorch/pytorch/pull/165815.
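Note on the install mechanism: vllm/env_override.py swaps the patched function
onto Inductor's Scheduler class, as the `_update_scheduler_patched` hunk in
patch 1 shows. Below is a minimal, self-contained sketch of that pattern; the
torch-version guard and the pass-through body are illustrative assumptions,
not vLLM code.

    import torch

    if torch.__version__.startswith("2.9"):
        from torch._inductor.scheduler import Scheduler

        _original = Scheduler.get_graph_partition_signature

        def get_graph_partition_signature_patched(self, partitions, skip_cudagraphs):
            # vLLM substitutes the fixed implementation from patch 1 here;
            # this sketch delegates to the stock method so it stays runnable.
            return _original(self, partitions, skip_cudagraphs)

        # Mirrors the assignment added to _update_scheduler_patched in patch 1.
        Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched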
From cfc0e585fffddfa4ec84ac7e445d83bd8baaeb94 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Sat, 18 Oct 2025 10:14:38 -0700 Subject: [PATCH 3/3] nit Signed-off-by: Boyuan Feng --- tests/compile/test_fusions_e2e.py | 11 +++++++---- vllm/env_override.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py index 24f9a874195b..7399abaec542 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/test_fusions_e2e.py @@ -116,7 +116,7 @@ def test_attn_quant( allreduce_fusions: int, custom_ops: str, inductor_graph_partition: bool, - caplog_vllm, + caplog_mp_spawn, monkeypatch, ): if backend == _Backend.FLASHINFER and ( @@ -157,11 +157,14 @@ def test_attn_quant( inductor_compile_config={"force_disable_caches": True}, ) - with caplog_vllm.at_level(logging.DEBUG): + with caplog_mp_spawn(logging.DEBUG) as log_holder: run_model(compilation_config, model_name, **model_kwargs) - matches = re.findall(r"Fused quant onto (\d+) attention nodes", caplog_vllm.text) - assert len(matches) == 1, caplog_vllm.text + matches = re.findall( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", + log_holder.text, + ) + assert len(matches) == 1, log_holder.text assert int(matches[0]) == attention_fusions diff --git a/vllm/env_override.py b/vllm/env_override.py index 8c51614af6f0..8b6eecd6c842 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -94,7 +94,7 @@ def get_output_names(graph_outputs) -> list[str]: # torch 2.9 Inductor get_graph_partition_signature monkeypatch # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to -# work around a test failure for +# fix inductor partition + attention-nvfp4 quant fusion, tested in # `tests/compile/test_fusions_e2e.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815.
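
For reference, the log assertion that patch 3 restores can be exercised on its
own: caplog_mp_spawn captures log text from the spawned engine process, and
the `fusion_attn.py:\d+]` prefix pins the match to the fusion pass's own log
line, which is why patch 3 reinstates it. A standalone sketch, where the
sample log line is hypothetical:

    import re

    # Hypothetical log line in the shape emitted by the attention-fusion pass.
    log_text = "fusion_attn.py:142] Fused quant onto 32 attention nodes"

    matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_text,
    )
    assert len(matches) == 1, log_text
    assert int(matches[0]) == 32  # the attention_fusions count expected by the test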