diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
new file mode 100644
index 000000000000..ec8d0ae13eaa
--- /dev/null
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file contains ops for ViT attention to be compatible with torch.compile
+as there are operations here not supported by torch.compile (for instance,
+`to_list` in xformers attn, or `.item()` in flash attention)
+
+Using these ops and wrapping vision blocks with `torch.compile` can speed up
+throughput in vision models by ~5% relative on H100, and improve token 
+latencies by ~7% (see qwen2_5_vl for example usage)
+
+To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0)
+"""
+import torch
+from einops import rearrange
+
+from vllm.utils import direct_register_custom_op
+
+
+def xformers_attn_seqlens_wrapper(q: torch.Tensor, k: torch.Tensor,
+                                  v: torch.Tensor,
+                                  seqlens: torch.Tensor) -> torch.Tensor:
+    from xformers import ops as xops
+    from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+    attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens.tolist(),
+                                               kv_seqlen=None,
+                                               device=q.device)
+    context_layer = xops.memory_efficient_attention_forward(
+        q, k, v, attn_bias=attn_bias, p=0, scale=None)
+    context_layer = rearrange(context_layer,
+                              "b s h d -> s b (h d)").contiguous()
+    return context_layer
+
+
+def xformers_attn_seqlens_wrapper_fake(q: torch.Tensor, k: torch.Tensor,
+                                       v: torch.Tensor,
+                                       seqlens: torch.Tensor) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="xformers_attn_seqlens_wrapper",
+    op_func=xformers_attn_seqlens_wrapper,
+    fake_impl=xformers_attn_seqlens_wrapper_fake,
+)
+
+
+def vit_xformers_attn_wrapper(q: torch.Tensor, k: torch.Tensor,
+                              v: torch.Tensor,
+                              seqlens: torch.Tensor) -> torch.Tensor:
+    return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens)
+
+
+def flash_attn_maxseqlen_wrapper(q: torch.Tensor, k: torch.Tensor,
+                                 v: torch.Tensor, cu_seqlens: torch.Tensor,
+                                 max_seqlen: torch.Tensor, batch_size: int,
+                                 is_rocm_aiter: bool,
+                                 use_upstream_fa: bool) -> torch.Tensor:
+    if is_rocm_aiter:
+        from aiter import flash_attn_varlen_func
+    else:
+        if use_upstream_fa:
+            from flash_attn import flash_attn_varlen_func
+        else:
+            from vllm.vllm_flash_attn import flash_attn_varlen_func
+    q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+    output = flash_attn_varlen_func(q,
+                                    k,
+                                    v,
+                                    cu_seqlens_q=cu_seqlens,
+                                    cu_seqlens_k=cu_seqlens,
+                                    max_seqlen_q=max_seqlen.item(),
+                                    max_seqlen_k=max_seqlen.item(),
+                                    dropout_p=0.0,
+                                    causal=False)
+    context_layer = rearrange(output, "(b s) h d -> s b (h d)",
+                              b=batch_size).contiguous()
+    return context_layer
+
+
+def flash_attn_maxseqlen_wrapper_fake(q: torch.Tensor, k: torch.Tensor,
+                                      v: torch.Tensor,
+                                      cu_seqlens: torch.Tensor,
+                                      max_seqlen: torch.Tensor,
+                                      batch_size: int, is_rocm_aiter: bool,
+                                      use_upstream_fa: bool) -> torch.Tensor:
+    b, s, h, d = q.shape
+    return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
+
+
+direct_register_custom_op(
+    op_name="flash_attn_maxseqlen_wrapper",
+    op_func=flash_attn_maxseqlen_wrapper,
+    fake_impl=flash_attn_maxseqlen_wrapper_fake,
+)
+
+
+def vit_flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                           cu_seqlens: torch.Tensor, max_seqlen: torch.Tensor,
+                           batch_size: int, is_rocm_aiter: bool,
+                           use_upstream_fa: bool) -> torch.Tensor:
+    return torch.ops.vllm.flash_attn_maxseqlen_wrapper(q, k, v, cu_seqlens,
+                                                       max_seqlen, batch_size,
+                                                       is_rocm_aiter,
+                                                       use_upstream_fa)
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 335bbda5e4eb..7a0119dd3b61 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -474,7 +474,6 @@ def configure_post_pass(self):
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
-
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index fa38cfe49a91..b1ca05bec824 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -13,7 +13,7 @@
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, VllmConfig
+from vllm.config import CompilationLevel, VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils import resolve_obj_by_qualname, supports_dynamo
@@ -192,13 +192,22 @@ def _support_torch_compile(
     # make sure super().__init__ is called on the base class
     #  other than TorchCompileWrapperWithCustomDispatcher
     cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
-
     old_init = cls.__init__
 
     setattr(cls, IGNORE_COMPILE_KEY, False)
 
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
-        old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
+    def __init__(self, **kwargs):
+        # NOTE: to support multimodal models (such as encoder),
+        # we may not have vllm_config so we only want to pass
+        # vllm_config when it is available.
+        sig = inspect.signature(old_init)
+        if 'vllm_config' in sig.parameters:
+            vllm_config = kwargs['vllm_config']
+        else:
+            vllm_config = get_current_vllm_config()
+
+        old_init(self, **kwargs)
+
         self.vllm_config = vllm_config
         enable_compile = enable_if is None or enable_if(vllm_config)
         # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
@@ -212,12 +221,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
             return
 
         compilation_counter.num_models_seen += 1
-        TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level)
+        if not hasattr(self.__class__, "compiled_callable"):
+            print(f"init self for {self.__class__}")
+            # only compile the same model once
+            # NOTE: this is probably not right, since parameters can change
+            # and cause us to fall over
+            TorchCompileWrapperWithCustomDispatcher.__init__(
+                self, compilation_level=vllm_config.compilation_config.level)
+            self.__class__.compiled_callable = self.compiled_callable
+        else:
+            print("init reusing the callable")
+            TorchCompileWrapperWithCustomDispatcher.__init__(
+                self,
+                self.__class__.compiled_callable,
+                compilation_level=vllm_config.compilation_config.level)
 
     cls.__init__ = __init__
 
     def __call__(self, *args, **kwargs):
+        print(f"Call to {self.__class__} forward")
         # torch.compiler.is_compiling() means we are inside the compilation
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
@@ -225,7 +247,7 @@ def __call__(self, *args, **kwargs):
             return self.forward(*args, **kwargs)
 
         # the first compilation needs to have dynamic shapes marked
-        if len(self.compiled_codes) < 1:
+        if len(self.__class__.compiled_codes) < 1:
             sig = inspect.signature(self.__class__.forward)
             bound_args = sig.bind(self, *args, **kwargs)
             bound_args.apply_defaults()
@@ -259,7 +281,8 @@ def __call__(self, *args, **kwargs):
         # if we don't use custom dispatcher, we can directly call the
         # compiled function and let torch.compile handle the dispatching,
         # with the overhead of guard evaluation and recompilation.
-        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
+        if len(self.__class__.compiled_codes
+               ) < 1 or not self.use_custom_dispatcher:
             # it seems Dynamo reuse the compilation across instances,
             # while we need to make sure the compiled code is not reused.
             # we need to control all the compilation of the model.
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 062c9dc27017..3a5732197587 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -53,7 +53,7 @@ def __init__(self,
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
-        self.compiled_codes: list[CodeType] = []
+        self.__class__.compiled_codes = []  # type: ignore[attr-defined]
         torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
 
         # read the env var to determine whether to use the custom dispatcher
@@ -91,8 +91,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
         if frame.f_locals["self"] is not self:
             return
 
-        self.compiled_codes.append(new_code)
-
+        self.__class__.compiled_codes.append(  # type: ignore[attr-defined]
+            new_code)
         path = self.vllm_config.compile_debug_dump_path()
         if path:
             decompiled_file = path / "transformed_code.py"
@@ -130,6 +130,7 @@ def dispatch_to_code(self, index: int):
 
         See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
         """ # noqa
-        self.__class__.forward.__code__ = self.compiled_codes[index]
+        self.__class__.forward.__code__ = self.__class__.compiled_codes[  # type: ignore[attr-defined]
+            index]
         yield
         self.__class__.forward.__code__ = self.original_code_object
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 0b6bccb33498..aa862b670f71 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -1048,6 +1048,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class Gemma3nForCausalLM(nn.Module):
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index da3889d31a7d..7775966f2780 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -39,9 +39,14 @@
     Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
 
 from vllm.attention.layer import check_upstream_fa_availability
+from vllm.attention.ops.vit_attn_wrappers import (vit_flash_attn_wrapper,
+                                                  vit_xformers_attn_wrapper)
+from vllm.compilation.backends import set_model_tag
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -335,8 +340,8 @@ def forward(
             x: torch.Tensor,
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+            max_seqlen: torch.Tensor,  # Only used for Flash Attention
+            seqlens: torch.Tensor,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -354,29 +359,10 @@ def forward(
             q, k = torch.chunk(qk_rotated, 2, dim=0)
 
         if self.is_flash_attn_backend:
-            if self.attn_backend == _Backend.ROCM_AITER_FA:
-                from aiter import flash_attn_varlen_func
-            else:
-                if self.use_upstream_fa:
-                    from flash_attn import flash_attn_varlen_func
-                else:
-                    from vllm.vllm_flash_attn import flash_attn_varlen_func
-
-            q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
-
-            output = flash_attn_varlen_func(q,
-                                            k,
-                                            v,
-                                            cu_seqlens_q=cu_seqlens,
-                                            cu_seqlens_k=cu_seqlens,
-                                            max_seqlen_q=max_seqlen,
-                                            max_seqlen_k=max_seqlen,
-                                            dropout_p=0.0,
-                                            causal=False)
-
-            context_layer = rearrange(output,
-                                      "(b s) h d -> s b (h d)",
-                                      b=batch_size).contiguous()
+            context_layer = vit_flash_attn_wrapper(
+                q, k, v, cu_seqlens, max_seqlen, batch_size,
+                self.attn_backend == _Backend.ROCM_AITER_FA,
+                self.use_upstream_fa)
         elif self.attn_backend == _Backend.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
             outputs = []
@@ -398,22 +384,19 @@ def forward(
             context_layer = rearrange(context_layer,
                                       "b s h d -> s b (h d)").contiguous()
         elif self.attn_backend == _Backend.XFORMERS:
-            from xformers import ops as xops
-            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
-            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None,
-                                                       device=q.device)
-
-            context_layer = xops.memory_efficient_attention_forward(
-                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-            context_layer = rearrange(context_layer,
-                                      "b s h d -> s b (h d)").contiguous()
+            context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
 
         output, _ = self.proj(context_layer)
         return output
 
 
+@set_model_tag("Qwen2_5_VisionBlock")
+@support_torch_compile(dynamic_arg_dims={
+    "x": 0,
+    "cu_seqlens": 0,
+    "rotary_pos_emb": 0,
+    "seqlens": 0,
+})
 class Qwen2_5_VisionBlock(nn.Module):
 
     def __init__(
@@ -456,8 +439,8 @@ def forward(
             x: torch.Tensor,
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: torch.Tensor,
-            max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-            seqlens: Optional[list[int]] = None,  # Only used for xFormers
+            max_seqlen: torch.Tensor,  # Only used for Flash Attention
+            seqlens: torch.Tensor,  # Only used for xFormers
     ) -> torch.Tensor:
         x_attn = self.attn(self.norm1(x),
                            cu_seqlens=cu_seqlens,
@@ -469,6 +452,10 @@ def forward(
         return x
 
 
+# @set_model_tag("Qwen2_5_VisionPatchEmbed")
+# @support_torch_compile(dynamic_arg_dims={
+#     "x": 0,
+# })
 class Qwen2_5_VisionPatchEmbed(nn.Module):
 
     def __init__(
@@ -498,6 +485,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+# @set_model_tag("Qwen2_5_VisionPatchMerger")
+# @support_torch_compile(dynamic_arg_dims={
+#     "x": 0,
+# })
 class Qwen2_5_VisionPatchMerger(nn.Module):
 
     def __init__(
@@ -731,13 +722,15 @@ def get_rope_by_thw(self, t, h, w):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
-        max_seqlen, seqlens = None, None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        max_seqlen, seqlens = torch.zeros(
+            1, device=cu_seqlens.device), torch.zeros(1,
+                                                      device=cu_seqlens.device)
         if (self.attn_backend == _Backend.FLASH_ATTN
                 or self.attn_backend == _Backend.ROCM_AITER_FA):
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
         elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1])
         return max_seqlen, seqlens
 
     @staticmethod
@@ -1001,6 +994,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self.config = config
+        self.vllm_config = vllm_config
         self.multimodal_config = multimodal_config
         self.video_pruning_rate = multimodal_config.video_pruning_rate
         self.is_multimodal_pruning_enabled = (
@@ -1009,7 +1003,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if multimodal_config.get_limit_per_prompt("image") or \
             multimodal_config.get_limit_per_prompt("video"):
             self.visual = Qwen2_5_VisionTransformer(
-                config.vision_config,
+                vision_config=config.vision_config,
                 norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                 quant_config=self.quant_config,
                 prefix=maybe_prefix(prefix, "visual"),
@@ -1127,15 +1121,16 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-
-            if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
-            else:
-                image_embeds = self.visual(pixel_values,
-                                           grid_thw=grid_thw_list)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual,
+                        pixel_values,
+                        grid_thw_list,
+                        rope_type="rope_3d")
+                else:
+                    image_embeds = self.visual(pixel_values,
+                                               grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
         # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
@@ -1187,14 +1182,16 @@ def _process_video_input(
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
-            if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(self.visual,
-                                                         pixel_values_videos,
-                                                         grid_thw_list,
-                                                         rope_type="rope_3d")
-            else:
-                video_embeds = self.visual(pixel_values_videos,
-                                           grid_thw=grid_thw_list)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual,
+                        pixel_values_videos,
+                        grid_thw_list,
+                        rope_type="rope_3d")
+                else:
+                    video_embeds = self.visual(pixel_values_videos,
+                                               grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size