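"""Tests for round-trip serialization of the broadcastable ModelInput
classes used to ship model inputs between workers."""
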
import dataclasses
from typing import List, Tuple, Type

import torch

from vllm.attention import AttentionMetadata
from vllm.attention.backends.abstract import AttentionBackend
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.worker.embedding_model_runner import (
    ModelInputForGPUWithPoolingMetadata)
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata


class MockAttentionBackend(AttentionBackend):
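    """Stub backend for the (de)serialization tests below.

    Only get_metadata_cls() returns a usable value; it is the hook
    from_broadcasted_tensor_dict() relies on (via the attn_backend
    argument) to rebuild the attention metadata. Every other method is
    an unused stub.
    """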

    @staticmethod
    def get_name() -> str:
        raise NotImplementedError

    @staticmethod
    def get_impl_cls():
        raise NotImplementedError

    @staticmethod
    def get_metadata_cls() -> Type["AttentionMetadata"]:
        return AttentionMetadata

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        raise NotImplementedError

    @staticmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
        src_to_dst: torch.Tensor,
    ) -> None:
        pass

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
        pass


def test_model_runner_input():
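    # The string arguments are placeholders: of the sampling metadata,
    # only selected_token_indices survives the broadcast round trip
    # (asserted below), so the remaining fields are never inspected.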
    sampling_metadata = SamplingMetadata(
        ["seq_group"],
        "selected_token_indices",
        "categorized_sample_indices",
        "num_prompts",
    )
    attn_metadata = AttentionMetadata(
        num_prefills=1,
        num_prefill_tokens=2,
        num_decode_tokens=3,
        slot_mapping=torch.zeros(1),
    )
    model_input = ModelInputForGPUWithSamplingMetadata(
        input_tokens=torch.ones(10),
        input_positions=torch.ones(10),
        sampling_metadata=sampling_metadata,
        attn_metadata=attn_metadata)

    assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata)

    # Test round-trip serialization.
    tensor_dict = model_input.as_broadcastable_tensor_dict()
    attn_backend = MockAttentionBackend()
    received_model_input = (
        ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
            tensor_dict, attn_backend=attn_backend))
    # Check that the received copy has the correct values.
    assert isinstance(received_model_input,
                      ModelInputForGPUWithSamplingMetadata)
    assert received_model_input.input_tokens is not None
    assert (
        received_model_input.input_tokens == model_input.input_tokens).all()
    assert received_model_input.input_positions is not None
    assert (received_model_input.input_positions == model_input.input_positions
            ).all()
    assert received_model_input.multi_modal_kwargs is None
    assert (received_model_input.multi_modal_kwargs ==
            model_input.multi_modal_kwargs)
    assert received_model_input.lora_requests is None
    assert received_model_input.lora_requests == model_input.lora_requests
    assert received_model_input.lora_mapping is None
    assert received_model_input.lora_mapping == model_input.lora_mapping
    for field in dataclasses.fields(AttentionMetadata):
        assert getattr(received_model_input.attn_metadata, field.name,
                       None) == getattr(attn_metadata, field.name, None)
    # For sampling metadata, only selected_token_indices is copied.
    assert (received_model_input.sampling_metadata.selected_token_indices ==
            sampling_metadata.selected_token_indices)
    assert received_model_input.sampling_metadata.seq_groups is None


def test_embedding_model_runner_input():
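    # Pooling metadata is not broadcast at all (asserted at the end of
    # this test), so minimal placeholder values suffice.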
    pooling_metadata = PoolingMetadata(
        seq_groups=[[0]],
        seq_data={},
        prompt_lens=[1],
    )
    attn_metadata = AttentionMetadata(
        num_prefills=1,
        num_prefill_tokens=2,
        num_decode_tokens=3,
        slot_mapping=torch.zeros(1),
    )
    model_input = ModelInputForGPUWithPoolingMetadata(
        input_tokens=torch.ones(10),
        input_positions=torch.ones(10),
        pooling_metadata=pooling_metadata,
        attn_metadata=attn_metadata)

    assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata)

    # Test round-trip serialization.
    tensor_dict = model_input.as_broadcastable_tensor_dict()
    attn_backend = MockAttentionBackend()
    received_model_input = (
        ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
            tensor_dict, attn_backend=attn_backend))
    # Check that the received copy has the correct values.
    assert isinstance(received_model_input,
                      ModelInputForGPUWithPoolingMetadata)
    assert received_model_input.input_tokens is not None
    assert (
        received_model_input.input_tokens == model_input.input_tokens).all()
    assert received_model_input.input_positions is not None
    assert (received_model_input.input_positions == model_input.input_positions
            ).all()
    assert received_model_input.multi_modal_kwargs is None
    assert (received_model_input.multi_modal_kwargs ==
            model_input.multi_modal_kwargs)
    assert received_model_input.lora_requests is None
    assert received_model_input.lora_requests == model_input.lora_requests
    assert received_model_input.lora_mapping is None
    assert received_model_input.lora_mapping == model_input.lora_mapping
    for field in dataclasses.fields(AttentionMetadata):
        assert getattr(received_model_input.attn_metadata, field.name,
                       None) == getattr(attn_metadata, field.name, None)
    # Pooling metadata is not broadcast.
    assert received_model_input.pooling_metadata is None