Enable use_inductor_graph_partition by default on torch >= 2.9. Also update tests that rely on it being False to set it explicitly

ProExpertProg · ProExpertProg · commit 66ab1b0ebab2 · 2025-10-02T17:49:57.000Z
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
@@ -174,6 +174,7 @@ def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor,
         return output.cpu()
 
 
+# TODO: test with use_inductor_graph_partition both True and False
 def test_multi_graph_piecewise_compile_outputs_equal():
     outputs = []
 
@@ -212,7 +213,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():
 
     # no compile or cudagraph
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION, ))
+        level=CompilationLevel.NO_COMPILATION,
+        use_inductor_graph_partition=False,
+    ))
     cudagraph_runtime_mode = CUDAGraphMode.NONE
 
     with set_current_vllm_config(vllm_config):
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
@@ -256,7 +256,9 @@ def run_model(llama_config,
         cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
     else:
         compilation_config = CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION, )
+            level=CompilationLevel.NO_COMPILATION,
+            use_inductor_graph_partition=False, # TODO try both?
+        )
         cudagraph_runtime_mode = CUDAGraphMode.NONE
 
     vllm_config = VllmConfig(compilation_config=compilation_config,
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
@@ -57,6 +57,7 @@ def test_ignore_torch_compile_decorator():
         use_cudagraph=True,
         splitting_ops=["silly.attention"],
         cudagraph_capture_sizes=[1, 2],
+        use_inductor_graph_partition=False, # TODO test both
     ))
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
@@ -182,6 +183,7 @@ def test_conditional_compile_enable_if():
                                  use_cudagraph=True,
                                  splitting_ops=["silly.attention"],
                                  cudagraph_capture_sizes=[1, 2],
+                                 use_inductor_graph_partition=False, # TODO test both
                              ))
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
@@ -311,7 +311,7 @@ class CompilationConfig:
     FULL_AND_PIECEWISE instead.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = is_torch_equal_or_newer("2.9.0")
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps