Commit 4d19b7b

Subclass attn metadata for cross-decoder layers to propagate logits_indices

Signed-off-by: Yong Hoon Shin <[email protected]>

1 parent 94df2f1 commit 4d19b7b
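The idea behind propagating logits_indices to the cross-decoder layers: in a YOCO-style KV-sharing setup the cross-decoder layers read KV caches written by the self-decoder layers, so during prefill they only need to produce useful hidden states for the positions that will actually be sampled. A minimal, illustrative sketch of that truncation (names, shapes, and the layer interface are assumptions for illustration, not vLLM's implementation):

import torch

def truncated_prefill_sketch(hidden_states: torch.Tensor,
                             logits_indices: torch.Tensor,
                             cross_decoder_layers) -> torch.Tensor:
    # hidden_states: [num_tokens, hidden_size] coming out of the self-decoder
    # layers, whose KV caches the cross-decoder layers reuse.
    # Only rows that will be sampled need to flow through the cross-decoder
    # layers, so gather them first.
    truncated = hidden_states[logits_indices]  # [num_logits, hidden_size]
    for layer in cross_decoder_layers:
        # each layer attends against the shared (already written) KV cache
        truncated = layer(truncated)
    # Scatter the results back so downstream code still sees the full shape;
    # positions that are never sampled keep their old values.
    out = hidden_states.clone()
    out[logits_indices] = truncated
    return out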

File tree

9 files changed: +264 additions, -56 deletions

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import gc
+import random
+from typing import Optional, Union
+
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig, CompilationLevel
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
+from vllm.model_executor.models.registry import ModelRegistry
+from vllm.sequence import IntermediateTensors
+
+from ...utils import fork_new_process_for_each_test
+
+
+class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds, **kwargs)
+        attn_metadata = get_forward_context().attn_metadata
+        # attn_metadata is None during dummy runs
+        if attn_metadata is not None:
+            assert isinstance(attn_metadata, dict)  # true in V1
+            # Layer 20 is a cross-decoder layer in YOCO
+            layer_attn_metadata = attn_metadata[
+                'model.language_model.layers.20.self_attn.attn']
+            if hasattr(layer_attn_metadata, 'logits_indices_padded'):
+                # This field is only set when
+                # enable_kv_sharing_truncated_prefill is set to True
+                assert self.cache_config.enable_kv_sharing_truncated_prefill
+                logits_indices_padded = (
+                    layer_attn_metadata.logits_indices_padded)
+                assert logits_indices_padded is not None
+                num_logits_indices = layer_attn_metadata.num_logits_indices
+                assert num_logits_indices > 0
+
+                logits_hs = hidden_states[logits_indices_padded]
+                hidden_states = torch.randn_like(hidden_states)
+                gen_indices = logits_indices_padded[:num_logits_indices]
+                # Only set logits for logits_indices to valid values
+                hidden_states[gen_indices] = logits_hs[:num_logits_indices]
+
+        return hidden_states
+
+
+@pytest.fixture
+def test_prompts():
+    """
+    Adapted from tests/v1/e2e/test_spec_decode.py
+    """
+    prompt_types = ["repeat", "sentence"]
+    # Setting higher num prompts increases the chance of numerics mismatch
+    # due to matrix multiplication numerics depending on batch dimension
+    num_prompts = 10
+    prompts = []
+
+    random.seed(0)
+    random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
+
+    for kind in random_prompt_type_choices:
+        word_choices = ["test", "temp", "hello", "where"]
+        word = random.choice(word_choices)
+        if kind == "repeat":
+            prompt = f"""please repeat the word '{word}' 10 times."""
+        elif kind == "sentence":
+            prompt = f"""please give a ten-word sentence that
+            uses the word {word} at least once."""
+        else:
+            raise ValueError(f"Unknown prompt type: {kind}")
+        prompts.append(prompt)
+
+    return prompts
+
+
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_kv_sharing_truncated_prefill(
+    monkeypatch: pytest.MonkeyPatch,
+    enforce_eager: bool,
+    test_prompts: list[str],
+):
+    ModelRegistry.register_model("Gemma3nForConditionalGeneration",
+                                 TestGemma3nForConditionalGeneration)
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    compilation_config = CompilationConfig(
+        # This allows vLLM compilation backend to handle allocating and
+        # managing buffers for cudagraph
+        cudagraph_copy_inputs=True,
+        level=CompilationLevel.PIECEWISE
+        if not enforce_eager else CompilationLevel.NO_COMPILATION)
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        llm = LLM(
+            model="google/gemma-3n-E2B-it",
+            enforce_eager=enforce_eager,
+            compilation_config=compilation_config,
+        )
+        ref_responses = llm.generate(test_prompts, sampling_params)
+
+        del llm
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        llm = LLM(model="google/gemma-3n-E2B-it",
+                  enforce_eager=enforce_eager,
+                  compilation_config=compilation_config,
+                  enable_kv_sharing_truncated_prefill=True)
+        optimized_responses = llm.generate(test_prompts, sampling_params)
+
+        misses = 0
+
+        for ref_response, optimized_response in zip(ref_responses,
+                                                    optimized_responses):
+            if (ref_response.outputs[0].text !=
+                    optimized_response.outputs[0].text):
+                misses += 1
+
+        assert misses == 0

vllm/config.py

Lines changed: 4 additions & 0 deletions
@@ -1684,6 +1684,10 @@ class CacheConfig:
     num_cpu_blocks: Optional[int] = field(default=None, init=False)
     """The number of blocks to allocate for CPU memory."""

+    enable_kv_sharing_truncated_prefill: bool = False
+    """Skip prefill for tokens where applicable in YOCO-like KV-sharing
+    setups (e.g. Gemma3n)"""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,

vllm/engine/arg_utils.py

Lines changed: 8 additions & 0 deletions
@@ -438,6 +438,9 @@ class EngineArgs:
     # DEPRECATED
     enable_prompt_adapter: bool = False

+    enable_kv_sharing_truncated_prefill: bool = \
+        CacheConfig.enable_kv_sharing_truncated_prefill
+
     def __post_init__(self):
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
@@ -686,6 +689,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                  **cache_kwargs["cpu_offload_gb"])
         cache_group.add_argument("--calculate-kv-scales",
                                  **cache_kwargs["calculate_kv_scales"])
+        cache_group.add_argument(
+            "--enable-kv-sharing-truncated-prefill",
+            **cache_kwargs["enable_kv_sharing_truncated_prefill"])

         # Multimodal related configs
         multimodal_kwargs = get_kwargs(MultiModalConfig)
@@ -1056,6 +1062,8 @@ def create_engine_config(
             prefix_caching_hash_algo=self.prefix_caching_hash_algo,
             cpu_offload_gb=self.cpu_offload_gb,
             calculate_kv_scales=self.calculate_kv_scales,
+            enable_kv_sharing_truncated_prefill=self.
+            enable_kv_sharing_truncated_prefill,
         )

         # Get the current placement group if Ray is initialized and
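With the flag wired through EngineArgs as above (and through the LLM constructor in the next file), enabling it should look roughly like this; a hedged usage sketch where the model name is only an example:

# CLI sketch, assuming the flag registered above:
#   vllm serve google/gemma-3n-E2B-it --enable-kv-sharing-truncated-prefill

# Python API sketch, mirroring the keyword argument added to LLM below:
from vllm import LLM

llm = LLM(model="google/gemma-3n-E2B-it",
          enable_kv_sharing_truncated_prefill=True)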

vllm/entrypoints/llm.py

Lines changed: 3 additions & 0 deletions
@@ -193,6 +193,7 @@ def __init__(
         override_pooler_config: Optional[PoolerConfig] = None,
         compilation_config: Optional[Union[int, dict[str, Any],
                                            CompilationConfig]] = None,
+        enable_kv_sharing_truncated_prefill: bool = False,
         **kwargs,
     ) -> None:
         """LLM constructor."""
@@ -266,6 +267,8 @@ def __init__(
             mm_processor_kwargs=mm_processor_kwargs,
             override_pooler_config=override_pooler_config,
             compilation_config=compilation_config_instance,
+            enable_kv_sharing_truncated_prefill=\
+                enable_kv_sharing_truncated_prefill,
             **kwargs,
         )

vllm/envs.py

Lines changed: 0 additions & 5 deletions
@@ -143,7 +143,6 @@
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
-    VLLM_COMPUTE_PADDED_LOGITS_INDICES: bool = False


 def get_default_cache_root():
@@ -992,10 +991,6 @@ def get_vllm_port() -> Optional[int]:
     # The default value is "VLLM".
     "VLLM_PROCESS_NAME_PREFIX":
     lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
-
-    # Enable computing and propagating cudagraph padded logits indices
-    "VLLM_COMPUTE_PADDED_LOGITS_INDICES":
-    lambda: bool(int(os.getenv("VLLM_COMPUTE_PADDED_LOGITS_INDICES", "0"))),
 }

 # --8<-- [end:env-vars-definition]

vllm/forward_context.py

Lines changed: 0 additions & 3 deletions
@@ -95,7 +95,6 @@ class ForwardContext:
     # set dynamically for each forward pass
     dp_metadata: Optional[DPMetadata] = None
     skip_cuda_graphs: bool = False
-    logits_indices_padded: Optional[torch.Tensor] = None


 _forward_context: Optional[ForwardContext] = None
@@ -117,7 +116,6 @@ def set_forward_context(
     num_tokens: Optional[int] = None,
     num_tokens_across_dp: Optional[torch.Tensor] = None,
     skip_cuda_graphs: bool = False,
-    logits_indices_padded: Optional[torch.Tensor] = None,
 ):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
@@ -143,7 +141,6 @@ def set_forward_context(
         attn_metadata=attn_metadata,
         dp_metadata=dp_metadata,
         skip_cuda_graphs=skip_cuda_graphs,
-        logits_indices_padded=logits_indices_padded,
     )

     try:

vllm/model_executor/models/gemma3n.py

Lines changed: 21 additions & 1 deletion
@@ -581,6 +581,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             lambda prefix: Gemma3nDecoderLayer(
                 config, cache_config, quant_config, prefix=prefix),
             prefix=f"{prefix}.layers")
+
+        first_kv_shared_layer_idx = (config.num_hidden_layers -
+                                     config.num_kv_shared_layers)
+        # Layer idx 0-19 are self-decoder layers in You Only Cache Once (YOCO)
+        self.self_decoder_layers = self.layers[:first_kv_shared_layer_idx]
+        # Layer idx 20-34 are cross-decoder layers in YOCO
+        # Refer to YOCO paper https://arxiv.org/abs/2405.05254
+        self.cross_decoder_layers = self.layers[first_kv_shared_layer_idx:]
+
         self.norm = RMSNorm(
             config.hidden_size,
             eps=config.rms_norm_eps,
@@ -646,7 +655,17 @@ def forward(
             hidden_states = torch.stack(hidden_states, dim=0)

         # Transformer blocks.
-        for layer_idx, layer in enumerate(self.layers):
+        for layer_idx, layer in enumerate(self.self_decoder_layers):
+            # [altup_num_inputs, num_tokens, hidden_size]
+            hidden_states = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                per_layer_input=per_layer_inputs[:, layer_idx, :],
+                **kwargs,
+            )
+
+        for layer_idx, layer in enumerate(self.cross_decoder_layers,
+                                          start=len(self.self_decoder_layers)):
             # [altup_num_inputs, num_tokens, hidden_size]
             hidden_states = layer(
                 positions=positions,
@@ -771,6 +790,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         del lora_config  # Unused.
         super().__init__()
         self.config = config
+        self.cache_config = vllm_config.cache_config
         self.model = Gemma3nModel(vllm_config=vllm_config,
                                   prefix=maybe_prefix(prefix, "model"))
         self.logits_processor = LogitsProcessor(
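For concreteness, the comments above put the self-decoder at layer indices 0-19 and the cross-decoder at 20-34, which implies num_hidden_layers = 35 and num_kv_shared_layers = 15 for this model. A small sanity check of the index arithmetic (the config values are inferred from those comments, not read from the checkpoint):

# Illustrative only: values inferred from the comments in the diff above.
num_hidden_layers = 35     # assumed total decoder layers in Gemma3n
num_kv_shared_layers = 15  # assumed KV-shared (cross-decoder) layers

first_kv_shared_layer_idx = num_hidden_layers - num_kv_shared_layers
assert first_kv_shared_layer_idx == 20

layers = list(range(num_hidden_layers))
self_decoder_layers = layers[:first_kv_shared_layer_idx]   # indices 0-19
cross_decoder_layers = layers[first_kv_shared_layer_idx:]  # indices 20-34
assert cross_decoder_layers[0] == 20 and cross_decoder_layers[-1] == 34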

vllm/v1/attention/backends/utils.py

Lines changed: 15 additions & 2 deletions
@@ -3,8 +3,8 @@
 import abc
 import functools
 from abc import abstractmethod
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, ClassVar, Generic, Optional, TypeVar
+from dataclasses import dataclass, make_dataclass
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, Optional, TypeVar

 import numpy as np
 import torch
@@ -501,3 +501,16 @@ def reorder_batch_to_split_decodes_and_prefills(
             modified_batch = True

     return modified_batch
+
+
+def subclass_attention_metadata(
+    name_prefix: str,
+    metadata_cls: Any,
+    fields: list[tuple[str, Any, Any]],
+) -> Any:
+    """
+    Return a new subclass of `metadata_cls` with additional fields
+    """
+    name: str = name_prefix + metadata_cls.__name__  # type: ignore
+    Wrapped = make_dataclass(name, fields, bases=(metadata_cls, ))
+    return Wrapped
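This helper appears to be what the commit title refers to: the attention metadata for cross-decoder layers can be wrapped in a dynamically created dataclass subclass that also carries the logits-index fields. A minimal sketch of how it might be used (the field names match the test above; the base metadata class and the values are illustrative stand-ins, not vLLM's real backend metadata):

from dataclasses import dataclass
from typing import Optional

import torch

from vllm.v1.attention.backends.utils import subclass_attention_metadata


@dataclass
class DummyAttentionMetadata:  # stand-in for a real backend's metadata class
    num_actual_tokens: int


# Build a subclass that can additionally carry the truncated-prefill fields.
TruncatedPrefillMetadata = subclass_attention_metadata(
    name_prefix="TruncatedPrefill",
    metadata_cls=DummyAttentionMetadata,
    fields=[
        ("logits_indices_padded", Optional[torch.Tensor], None),
        ("num_logits_indices", int, 0),
    ],
)

meta = TruncatedPrefillMetadata(
    num_actual_tokens=8,
    logits_indices_padded=torch.tensor([3, 7, 7, 7]),  # padded for cudagraph
    num_logits_indices=2,
)
assert isinstance(meta, DummyAttentionMetadata)
assert hasattr(meta, "logits_indices_padded")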

0 commit comments