
Commit 0364a68

See title; consolidate so that we only compile a class once
Signed-off-by: Lucas Kabela <[email protected]>
1 parent db7f9a8 commit 0364a68
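
The idea in a nutshell: the first instance of a decorated model class runs compilation and stashes the result on the class, and every later instance reuses it. Below is a minimal, self-contained sketch of that pattern; TinyModel and compiled_forward are illustrative names, and plain torch.compile stands in for vLLM's TorchCompileWrapperWithCustomDispatcher machinery.

import torch


class TinyModel(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        if not hasattr(self.__class__, "compiled_forward"):
            # First instance of this class: compile forward once and cache
            # the compiled callable on the class itself.
            self.__class__.compiled_forward = torch.compile(self.__class__.forward)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


a, b = TinyModel(), TinyModel()
# Both instances share the single compiled callable; no second compilation.
assert a.__class__.compiled_forward is b.__class__.compiled_forward
out = a.__class__.compiled_forward(a, torch.randn(2, 8))

As the NOTE in the decorators.py diff below acknowledges, this assumes all instances of a class can safely share one compiled artifact, which may not hold if their parameters differ.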

File tree

5 files changed: +32 -17 lines changed


vllm/compilation/backends.py

Lines changed: 0 additions & 1 deletion

@@ -474,7 +474,6 @@ def configure_post_pass(self):
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
-
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors

vllm/compilation/decorators.py

Lines changed: 18 additions & 5 deletions

@@ -192,7 +192,6 @@ def _support_torch_compile(
     # make sure super().__init__ is called on the base class
     # other than TorchCompileWrapperWithCustomDispatcher
     cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
-
     old_init = cls.__init__
 
     setattr(cls, IGNORE_COMPILE_KEY, False)
@@ -222,20 +221,33 @@ def __init__(self, **kwargs):
             return
 
         compilation_counter.num_models_seen += 1
-        TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level)
+        if not hasattr(self.__class__, "compiled_callable"):
+            print(f"init self for {self.__class__}")
+            # only compile the same model once
+            # NOTE: this is probably not right, since parameters can change
+            # and cause us to fall over
+            TorchCompileWrapperWithCustomDispatcher.__init__(
+                self, compilation_level=vllm_config.compilation_config.level)
+            self.__class__.compiled_callable = self.compiled_callable
+        else:
+            print("init reusing the callable")
+            TorchCompileWrapperWithCustomDispatcher.__init__(
+                self,
+                self.__class__.compiled_callable,
+                compilation_level=vllm_config.compilation_config.level)
 
     cls.__init__ = __init__
 
     def __call__(self, *args, **kwargs):
+        print(f"Call to {self.__class__} forward")
         # torch.compiler.is_compiling() means we are inside the compilation
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling():
             return self.forward(*args, **kwargs)
 
         # the first compilation needs to have dynamic shapes marked
-        if len(self.compiled_codes) < 1:
+        if len(self.__class__.compiled_codes) < 1:
             sig = inspect.signature(self.__class__.forward)
             bound_args = sig.bind(self, *args, **kwargs)
             bound_args.apply_defaults()
@@ -269,7 +281,8 @@ def __call__(self, *args, **kwargs):
         # if we don't use custom dispatcher, we can directly call the
         # compiled function and let torch.compile handle the dispatching,
         # with the overhead of guard evaluation and recompilation.
-        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
+        if len(self.__class__.compiled_codes
+               ) < 1 or not self.use_custom_dispatcher:
             # it seems Dynamo reuse the compilation across instances,
             # while we need to make sure the compiled code is not reused.
             # we need to control all the compilation of the model.

vllm/compilation/wrapper.py

Lines changed: 5 additions & 3 deletions

@@ -53,7 +53,7 @@ def __init__(self,
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
-        self.compiled_codes: list[CodeType] = []
+        self.__class__.compiled_codes = []  # type: ignore[attr-defined]
         torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
 
         # read the env var to determine whether to use the custom dispatcher
@@ -91,7 +91,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
         if frame.f_locals["self"] is not self:
             return
 
-        self.compiled_codes.append(new_code)
+        self.__class__.compiled_codes.append(  # type: ignore[attr-defined]
+            new_code)
         debug_dump_dir = self.vllm_config.compilation_config.debug_dump_path
         if isinstance(debug_dump_dir, str) and debug_dump_dir != "":
            rank = self.vllm_config.parallel_config.rank
@@ -131,6 +132,7 @@ def dispatch_to_code(self, index: int):
 
         See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
         """  # noqa
-        self.__class__.forward.__code__ = self.compiled_codes[index]
+        self.__class__.forward.__code__ = self.__class__.compiled_codes[  # type: ignore[attr-defined]
+            index]
         yield
         self.__class__.forward.__code__ = self.original_code_object
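
The dispatch_to_code hunk builds on the fact that assigning to a function's __code__ redirects every caller of that method, i.e. the swap is inherently class-wide. A minimal CPython-only sketch of that mechanism (Greeter and _compiled are made-up names; no torch involved):

from contextlib import contextmanager


class Greeter:

    def forward(self) -> str:
        return "original"


def _compiled(self) -> str:
    return "compiled"


_original_code = Greeter.forward.__code__


@contextmanager
def dispatch_to(new_code):
    # Swap the code object in, restore it on exit -- the same shape as
    # dispatch_to_code in the diff above.
    Greeter.forward.__code__ = new_code
    try:
        yield
    finally:
        Greeter.forward.__code__ = _original_code


g = Greeter()
with dispatch_to(_compiled.__code__):
    assert g.forward() == "compiled"  # affects every instance of Greeter
assert g.forward() == "original"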

vllm/model_executor/models/gemma3n.py

Lines changed: 1 addition & 0 deletions

@@ -1048,6 +1048,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class Gemma3nForCausalLM(nn.Module):
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 8 additions & 8 deletions

@@ -518,10 +518,10 @@ def forward(
         return x
 
 
-@set_model_tag("Qwen2_5_VisionPatchEmbed")
-@support_torch_compile(dynamic_arg_dims={
-    "x": 0,
-})
+# @set_model_tag("Qwen2_5_VisionPatchEmbed")
+# @support_torch_compile(dynamic_arg_dims={
+#     "x": 0,
+# })
 class Qwen2_5_VisionPatchEmbed(nn.Module):
 
     def __init__(
@@ -551,10 +551,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
-@set_model_tag("Qwen2_5_VisionPatchMerger")
-@support_torch_compile(dynamic_arg_dims={
-    "x": 0,
-})
+# @set_model_tag("Qwen2_5_VisionPatchMerger")
+# @support_torch_compile(dynamic_arg_dims={
+#     "x": 0,
+# })
 class Qwen2_5_VisionPatchMerger(nn.Module):
 
     def __init__(
