From 99eddb06a0b5396a478af7c80d8035d17dd4efed Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 22:11:08 -0700 Subject: [PATCH 01/13] [V0 deprecation] Remove V0 CPU/XPU/TPU backends Signed-off-by: Woosuk Kwon --- vllm/attention/backends/pallas.py | 356 ---------- vllm/platforms/tpu.py | 51 +- vllm/worker/cpu_enc_dec_model_runner.py | 326 --------- vllm/worker/cpu_model_runner.py | 671 ----------------- vllm/worker/cpu_pooling_model_runner.py | 125 ---- vllm/worker/cpu_worker.py | 450 ------------ vllm/worker/multi_step_tpu_worker.py | 108 --- vllm/worker/tpu_model_runner.py | 909 ------------------------ vllm/worker/tpu_worker.py | 337 --------- vllm/worker/xpu_model_runner.py | 606 ---------------- vllm/worker/xpu_worker.py | 186 ----- 11 files changed, 16 insertions(+), 4109 deletions(-) delete mode 100644 vllm/attention/backends/pallas.py delete mode 100644 vllm/worker/cpu_enc_dec_model_runner.py delete mode 100644 vllm/worker/cpu_model_runner.py delete mode 100644 vllm/worker/cpu_pooling_model_runner.py delete mode 100644 vllm/worker/cpu_worker.py delete mode 100644 vllm/worker/multi_step_tpu_worker.py delete mode 100644 vllm/worker/tpu_model_runner.py delete mode 100644 vllm/worker/tpu_worker.py delete mode 100644 vllm/worker/xpu_model_runner.py delete mode 100644 vllm/worker/xpu_worker.py diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py deleted file mode 100644 index c900666955a3..000000000000 --- a/vllm/attention/backends/pallas.py +++ /dev/null @@ -1,356 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import torch_xla.experimental.custom_kernel # Required to register custom ops. 
- -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class PallasAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "PALLAS" - - @staticmethod - def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: - return PallasAttentionBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["PallasMetadata"]: - return PallasMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_kv_heads, num_blocks, block_size, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - raise RuntimeError("swap_blocks is not used for the TPU backend.") - - @torch.compile(backend="openxla") - @staticmethod - def copy_blocks( - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - src_to_dists: Tuple[torch.Tensor, torch.Tensor], - ) -> None: - src_indices, dst_indices = src_to_dists - for k_cache, v_cache in kv_caches: - torch.ops.xla.dynamo_set_buffer_donor_(k_cache, True) - k_cache[:, dst_indices] = k_cache[:, src_indices] - torch.ops.xla.dynamo_set_buffer_donor_(v_cache, True) - v_cache[:, dst_indices] = v_cache[:, src_indices] - - -@dataclass -class PallasMetadata(AttentionMetadata): - - # Currently, input sequences can only contain all prefills - # or all decoding. - block_tables: Optional[torch.Tensor] = None - context_lens: Optional[torch.Tensor] = None - effective_query_lens: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["PallasMetadata"]: - if self.num_prefills == 0: - return None - - assert self.num_decode_tokens == 0 - return self - - @property - def decode_metadata(self) -> Optional["PallasMetadata"]: - if self.num_decode_tokens == 0: - return None - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.block_tables is not None - assert self.context_lens is not None - return self - - -class PallasAttentionBackendImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if use_irope: - logger.warning_once( - "Using irope in Pallas is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.logits_soft_cap = logits_soft_cap - if head_size % 128 != 0: - raise NotImplementedError( - f"Head size must be a multiple of 128, found {head_size}.") - if alibi_slopes is not None: - raise NotImplementedError("Alibi slopes is not supported.") - if 
sliding_window is not None: - raise NotImplementedError("Sliding window is not supported.") - if is_quantized_kv_cache(kv_cache_dtype): - raise NotImplementedError("FP8 KV cache dtype is not supported.") - if blocksparse_params is not None: - raise NotImplementedError("Blocksparse is not supported.") - - if torch_xla.tpu.version() < 4: - raise NotImplementedError("TPU version must be 4 or higher.") - - self.megacore_mode = None - tpu_env = torch_xla.tpu.get_tpu_env() - tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None) - or tpu_env.get("TYPE", None) - or tpu_env.get("TPU_ACCELERATOR_TYPE", None)) - assert tpu_type is not None - tpu_type = tpu_type.lower() - - if (("lite" not in tpu_type) and ("v6" not in tpu_type)): - if self.num_kv_heads % 2 == 0: - self.megacore_mode = "kv_head" - else: - # NOTE(woosuk): If the batch size is not a multiple of 2, the - # megacore mode will be None. - self.megacore_mode = "batch" - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], - attn_metadata: PallasMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Pallas attention. - - Args: - query: shape = [batch_size, seq_len, num_heads * head_size] - key: shape = [batch_size, seq_len, num_kv_heads * head_size] - value: shape = [batch_size, seq_len, num_kv_heads * head_size] - kv_cache[0] = [num_kv_heads, num_blocks, block_size, head_size] - kv_cache[1] = [num_kv_heads, num_blocks, block_size, head_size] - NOTE: kv_cache[0] and kv_cache[1] will be an empty tensor - with shape [0] for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [batch_size, seq_len, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for PallasAttentionImpl") - - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - batch_size, seq_len, hidden_size = query.shape - query = query.view(batch_size, seq_len, self.num_heads, self.head_size) - key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) - value = value.view(batch_size, seq_len, self.num_kv_heads, - self.head_size) - - if kv_cache[0].numel() > 0: - slot_mapping = attn_metadata.slot_mapping - key_cache, value_cache = kv_cache - write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping) - - query = query * self.scale - if attn_metadata.num_prefills > 0: - if attn_metadata.block_tables is None: - # Prefill without paged KV cache. - assert seq_len % 16 == 0, ( - "Pallas FlashAttention kernel requires seq_len to be a " - f"multiple of 16 but got {seq_len}") - - # Handle GQA/MQA. - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, - dim=-2) - key = key.view(batch_size, seq_len, self.num_heads, - self.head_size) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=-2) - value = value.view(batch_size, seq_len, self.num_heads, - self.head_size) - # FlashAttention kernel requires the input shape to be - # [batch_size, num_heads, seq_len, d_model] - # while the input is [batch_size, seq_len, num_heads, d_model]. - # Permute the input to match the required format. 
- output = torch.ops.xla.flash_attention( - query.permute(0, 2, 1, 3), - key.permute(0, 2, 1, 3), - value.permute(0, 2, 1, 3), - True, - ) - output = output.permute(0, 2, 1, 3) - else: - # Prefill with paged KV cache. - # TODO(woosuk): Tune the below knobs. - num_kv_pages_per_compute_block = 16 - num_queries_per_compute_block = 16 - assert seq_len % num_queries_per_compute_block == 0 - output = torch.ops.xla.multi_queries_paged_attention( - query, - key_cache, - value_cache, - attn_metadata.context_lens, - attn_metadata.block_tables, - attn_metadata.effective_query_lens, - num_kv_pages_per_compute_block, - num_queries_per_compute_block, - use_kernel=True, - attn_logits_soft_cap=self.logits_soft_cap, - ) - else: - # Decoding run. - assert kv_cache[0].numel() > 0 - query = query.squeeze(dim=1) - pages_per_compute_block = 16 # TODO(woosuk): Tune this value. - - assert attn_metadata.block_tables is not None - assert attn_metadata.context_lens is not None - # NOTE(woosuk): The PagedAttention Pallas kernel stores the entire - # block table in SMEM. Therefore, if the block table is too large, - # the kernel compilation will fail. To avoid this, we split the - # batch dimension into smaller chunks and run the kernel multiple - # times. - MAX_SMEM_USAGE = 512 * 1024 - size_per_seq = 4 * attn_metadata.block_tables.shape[1] - max_num_seq = MAX_SMEM_USAGE // size_per_seq - - if batch_size <= max_num_seq: - output = paged_attention( - query, - key_cache, - value_cache, - attn_metadata.context_lens, - attn_metadata.block_tables, - pages_per_compute_block, - self.megacore_mode, - attn_logits_soft_cap=self.logits_soft_cap, - ) - else: - chunk_size = max_num_seq - # Make sure the chunk size is a multiple of 2. - chunk_size = chunk_size // 2 * 2 - num_chunks = (batch_size + chunk_size - 1) // chunk_size - - output = torch.empty_like(query) - for chunk_idx in range(num_chunks): - chunk_start = chunk_idx * chunk_size - chunk_end = chunk_start + chunk_size - # NOTE(woosuk): We skip this line because it causes Dynamo - # compilation error. Instead, we rely on the slice operation - # to handle the out-of-bound case. - # chunk_end = min(chunk_end, batch_size) - chunk_output = paged_attention( - query[chunk_start:chunk_end], - key_cache, - value_cache, - attn_metadata.context_lens[chunk_start:chunk_end], - attn_metadata.block_tables[chunk_start:chunk_end], - pages_per_compute_block, - self.megacore_mode, - attn_logits_soft_cap=self.logits_soft_cap, - ) - output[chunk_start:chunk_end] = chunk_output - - # Reshape the output tensor. 
- return output.reshape(batch_size, seq_len, hidden_size) - - -def write_to_kv_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, -) -> None: - torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True) - torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True) - - key = key.flatten(0, 2) - value = value.flatten(0, 2) - key_cache = key_cache.flatten(0, 2) - value_cache = value_cache.flatten(0, 2) - key_cache.index_copy_(0, slot_mapping, key) - value_cache.index_copy_(0, slot_mapping, value) - - -def paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - context_lens: torch.Tensor, - block_tables: torch.Tensor, - pages_per_compute_block: int, - megacore_mode: Optional[str], - *, - attn_logits_soft_cap: Optional[float], -) -> torch.Tensor: - batch_size = query.shape[0] - if megacore_mode == "batch" and batch_size % 2 != 0: - megacore_mode = None - else: - megacore_mode = megacore_mode - - return torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - megacore_mode=megacore_mode, - attn_logits_soft_cap=attn_logits_soft_cap, - ) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 0387e348965d..a8c8cb46de2c 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -6,7 +6,6 @@ import torch from tpu_info import device -import vllm.envs as envs from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType @@ -50,12 +49,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, and selected_backend != _Backend.PALLAS_VLLM_V1): logger.info("Cannot use %s backend on TPU.", selected_backend) - if use_v1: - logger.info("Using Pallas V1 backend.") - return "vllm.v1.attention.backends.pallas.PallasAttentionBackend" - else: - logger.info("Using Pallas backend.") - return "vllm.attention.backends.pallas.PallasAttentionBackend" + if not use_v1: + raise ValueError("TPU backend only supports V1.") + logger.info("Using Pallas V1 backend.") + return "vllm.v1.attention.backends.pallas.PallasAttentionBackend" @classmethod def get_device_name(cls, device_id: int = 0) -> str: @@ -68,7 +65,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return not envs.VLLM_USE_V1 + return False @classmethod def get_punica_wrapper(cls) -> str: @@ -117,31 +114,19 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "Using bfloat16 instead.", vllm_config.model_config.dtype) vllm_config.model_config.dtype = torch.bfloat16 - if envs.VLLM_USE_V1: - from vllm.v1.attention.backends.pallas import ( - PallasAttentionBackend) - cache_config.block_size = PallasAttentionBackend.get_page_size( - vllm_config) # type: ignore[assignment] + from vllm.v1.attention.backends.pallas import PallasAttentionBackend + cache_config.block_size = PallasAttentionBackend.get_page_size( + vllm_config) # type: ignore[assignment] parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. 
Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker" - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.tpu_worker.TPUWorker" - else: - parallel_config.worker_cls = \ - "vllm.worker.tpu_worker.TPUWorker" + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on vLLM V1. Please launch without " + "--num-scheduler-steps.") + parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker" assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") @@ -189,13 +174,9 @@ def validate_request( processed_inputs: ProcessorInputs, ) -> None: """Raises if this request is unsupported on this platform""" - if isinstance(params, SamplingParams): - if params.guided_decoding is not None and not envs.VLLM_USE_V1: - raise ValueError("Structured output is not supported on " - f"{cls.device_name} V0.") - if params.sampling_type == SamplingType.RANDOM_SEED: - raise ValueError( - "Torch XLA does not support per-request seed.") + if (isinstance(params, SamplingParams) + and params.sampling_type == SamplingType.RANDOM_SEED): + raise ValueError("Torch XLA does not support per-request seed.") try: diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py deleted file mode 100644 index c99e2652a397..000000000000 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ /dev/null @@ -1,326 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast - -import torch - -from vllm.attention import AttentionMetadata -from vllm.forward_context import set_forward_context -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, - ModelInputForCPUBuilder, - ModelInputForCPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - - -@dataclasses.dataclass(frozen=True) -class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): - """ - Used by the EncoderDecoderModelRunner. 
- """ - encoder_input_tokens: Optional[torch.Tensor] = None - encoder_input_positions: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "encoder_input_tokens": self.encoder_input_tokens, - "encoder_input_positions": self.encoder_input_positions, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "EncoderDecoderModelInputForCPU": - return cast( - EncoderDecoderModelInputForCPU, - super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) - - -class CPUEncoderDecoderModelRunner( - CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): - _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( - EncoderDecoderModelInputForCPU) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - def _list_to_int32_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.int32, device=self.device) - - def _list_to_long_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.long, device=self.device) - - def _empty_int32_tensor(self) -> torch.Tensor: - return self._list_to_int32_tensor([]) - - def _empty_long_tensor(self) -> torch.Tensor: - return self._list_to_long_tensor([]) - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, - Any]) -> EncoderDecoderModelInputForCPU: - return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> EncoderDecoderModelInputForCPU: - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - ( - attn_metadata, - encoder_input_tokens_tensor, - encoder_input_positions_tensor, - ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, - model_input) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators) - return dataclasses.replace( - model_input, - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata, - encoder_input_tokens=encoder_input_tokens_tensor, - encoder_input_positions=encoder_input_positions_tensor, - virtual_engine=virtual_engine, - ) - - def _prepare_encoder_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - model_input: EncoderDecoderModelInputForCPU, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """Helper method to prepare the encoder- and cross-attn-related - model inputs based on a given sequence group. These additional inputs - are used to augment an already-computed `EncoderDecoderModelInput` - data structure which already has decoder-related model inputs - populated. 
- - Sets the following attn_metadata fields: - * `num_encoder_tokens` - * `encoder_seq_lens` - * `encoder_seq_lens_tensor` - * `max_encoder_seq_len` - * `cross_slot_mapping` - * `cross_block_tables` - - Constructs a new model inputs data structure, based on - (1) the existing fields in the `model_inputs` argument, - and (2) the following additional fields which are - computed (or in the case of `attn_metadata`, updated) - by this function: - * attn_metadata - * encoder_input_tokens - * encoder_input_positions - - Arguments: - - * seq_group_metadata_list: list of sequence groups for which to - compute inputs - * model_inputs: model inputs data structure with decoder-oriented - fields already computed. - - Return: - - * Updated model inputs data structure - """ - - if len(seq_group_metadata_list) == 0: - return (model_input.attn_metadata, None, None) - - # Since we are not supporting chunked prefill either the entire - # batch is prefill or it is decode - is_prompt = seq_group_metadata_list[0].is_prompt - - # Build encoder inputs - encoder_seq_lens: List[int] = [] - if is_prompt: - # Prefill phase. - cross_block_tables = self._empty_int32_tensor().view( - len(seq_group_metadata_list), -1) - - # Extract input tokens/positions, cross-attention slot-mapping, - # & seq len from each sequence group metadata - ( - encoder_input_tokens, - encoder_input_positions, - cross_slot_mapping, - ) = ( - [], - [], - [], - ) - for seq_group_metadata in seq_group_metadata_list: - # Build seq lens - seq_len = seq_group_metadata.encoder_seq_data.get_len() - token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() - encoder_seq_lens.append(seq_len) - - # Build slot mapping - for i in range(0, seq_len): - block_number = seq_group_metadata.cross_block_table[ - i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - cross_slot_mapping.append(slot) - - # Build encoder input tokens - encoder_input_tokens.extend(token_ids) - encoder_input_positions.extend(list(range(0, seq_len))) - - # Convert tokens/positions & cross-attention - # slot-mapping to encoder input tensors - encoder_input_tokens_tensor = self._list_to_long_tensor( - encoder_input_tokens) - encoder_input_positions_tensor = self._list_to_long_tensor( - encoder_input_positions) - cross_slot_mapping_tensor = self._list_to_long_tensor( - cross_slot_mapping) - - else: - # Decode phase. - encoder_input_tokens_tensor = self._empty_long_tensor() - encoder_input_positions_tensor = self._empty_long_tensor() - cross_slot_mapping_tensor = self._empty_long_tensor() - # Extract cross-attention block tables & - # seq len from each sequence group metadata. - # Cross-attention block tables are empty - # during vLLM memory profiling. 
- cross_block_tables = [] - for seq_group_metadata in seq_group_metadata_list: - for _ in range(len(seq_group_metadata.seq_data)): - encoder_seq_lens.append( - seq_group_metadata.encoder_seq_data.get_len()) - cross_block_table = seq_group_metadata.cross_block_table - cross_block_tables.append([] if ( - cross_block_table is None) else cross_block_table) - - max_len_of_block_table = max( - len(block_table) for block_table in cross_block_tables) - - cross_block_tables = make_tensor_with_pad( - cross_block_tables, - max_len=max_len_of_block_table, - pad=0, - dtype=torch.int32, - device=self.device, - ) - - # Compute encoder sequence lengths & encoder - # sequence starting offset tensors - max_encoder_seq_len = max(encoder_seq_lens, default=0) - encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + - 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - # Update attention metadata with encoder-oriented attributes - attn_metadata = model_input.attn_metadata - assert attn_metadata is not None - ( - attn_metadata.num_encoder_tokens, - attn_metadata.encoder_seq_lens, - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.cross_slot_mapping, - attn_metadata.cross_block_tables, - ) = ( - sum(encoder_seq_lens), - encoder_seq_lens, - encoder_seq_lens_tensor, - max_encoder_seq_len, - cross_slot_mapping_tensor, - cross_block_tables, - ) - - return (attn_metadata, encoder_input_tokens_tensor, - encoder_input_positions_tensor) - - @torch.no_grad() - def execute_model( - self, - model_input: EncoderDecoderModelInputForCPU, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - model_executable = self.model - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - "encoder_input_ids": - model_input.encoder_input_tokens, - "encoder_positions": - model_input.encoder_input_positions, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - "intermediate_tensors": - intermediate_tensors, - } - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable(**execute_model_kwargs) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - # Sample the next token. 
- output = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return [output] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py deleted file mode 100644 index 68cdf65cafa7..000000000000 --- a/vllm/worker/cpu_model_runner.py +++ /dev/null @@ -1,671 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import weakref -from collections import defaultdict -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, - TypeVar, Union) - -import torch -from torch import nn - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.models import supports_lora, supports_multimodal -from vllm.multimodal import (BatchedTensorInputs, MultiModalKwargs, - MultiModalPlaceholderMap) -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU") -_PAD_SLOT_ID = -1 - - -@dataclass(frozen=True) -class ModelInputForCPU(ModelRunnerInputBase): - """ - Base class contains metadata needed for the base model forward pass on CPU - """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - token_type_ids: Optional[torch.Tensor] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "token_type_ids": self.token_type_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForCPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None - ) -> TModelInputForCPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -@dataclass(frozen=True) -class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): - """ - 
Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "token_type_ids": self.token_type_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForCPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): - - class ModelInputData: - - def __init__(self, use_mrope: bool): - self.use_mrope = use_mrope - self.input_tokens: List[int] = [] - self.input_positions: List[int] = [] - self.token_type_ids: Optional[List[int]] = [] - self.seq_lens: List[int] = [] - self.query_lens: List[int] = [] - self.prefill_block_tables: List[List[int]] = [] - self.decode_block_tables: List[List[int]] = [] - self.max_decode_seq_len: int = 0 - self.num_prefills: int = 0 - self.num_prefill_tokens: int = 0 - self.num_decode_tokens: int = 0 - self.slot_mapping: List[int] = [] - self.multi_modal_inputs_list: List[MultiModalKwargs] = [] - self.multi_modal_placeholder_maps: Dict[ - str, MultiModalPlaceholderMap] = defaultdict( - MultiModalPlaceholderMap) - self.input_mrope_positions: List[List[int]] = [[] - for _ in range(3)] - - def __init__(self, - runner: "CPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: - super().__init__() - self.runner = runner - self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled - or runner.cache_config.enable_prefix_caching) - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.device = self.runner.device - self.enable_lora = self.runner.lora_config is not None - if self.runner.attn_backend is not None: - # spec decode (e.g. 
Medusa) does not have atten backend - attn_backend = self.runner.attn_backend - self.att_metadata_builder = attn_backend.get_builder_cls()(self) - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] - self.input_data = ModelInputForCPUBuilder.ModelInputData( - self.runner.model_config.uses_mrope) - self.att_metadata_builder.prepare() - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - self.seq_group_metadata_list.append(seq_group_metadata) - - def set_seq_group_list( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): - self.seq_group_metadata_list = seq_group_metadata_list - - def build(self) -> ModelInputForCPU: - self._build_input_data() - - input_data = self.input_data - input_tokens = torch.tensor(input_data.input_tokens, - dtype=torch.long, - device="cpu") - input_positions = torch.tensor( - input_data.input_positions - if not any(input_data.input_mrope_positions) else - input_data.input_mrope_positions, - dtype=torch.long, - device="cpu") - token_type_ids = torch.tensor(input_data.token_type_ids, - dtype=torch.long, - device="cpu") \ - if input_data.token_type_ids else None - - # For multi-modal models - multi_modal_kwargs = None - if len(input_data.multi_modal_inputs_list) != 0: - multi_modal_kwargs = MultiModalKwargs.batch( - input_data.multi_modal_inputs_list) - - attn_metadata = self.att_metadata_builder.build( - input_data.seq_lens, input_data.query_lens, -1, -1) - - is_prompt = (self.seq_group_metadata_list[0].is_prompt - if self.seq_group_metadata_list else None) - # LoRA data. - lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(seq.lora_request - for seq in self.seq_group_metadata_list - if seq.lora_request is not None) - - lora_mapping = self._prepare_lora_input( - self.seq_group_metadata_list, is_prompt) - - return self.model_input_cls(input_tokens=input_tokens, - input_positions=input_positions, - token_type_ids=token_type_ids, - seq_lens=input_data.seq_lens, - query_lens=input_data.query_lens, - attn_metadata=attn_metadata, - multi_modal_kwargs=multi_modal_kwargs, - lora_mapping=lora_mapping, - lora_requests=lora_requests) - - def _build_input_data(self): - for seq_group_metadata in self.seq_group_metadata_list: - for seq_id, seq_data in seq_group_metadata.seq_data.items(): - if seq_group_metadata.is_prompt: - self._compute_prompt_input_tokens(self.input_data, - seq_group_metadata, - seq_data, seq_id) - if seq_group_metadata.multi_modal_data: - self._compute_multi_modal_input( - seq_group_metadata, seq_data) - else: - self._compute_decode_input_tokens(self.input_data, - seq_group_metadata, - seq_data, seq_id) - - def _compute_decode_input_tokens(self, data: ModelInputData, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData, seq_id: int): - """ - Compute decode input tokens, positions, block table and slot mapping. 
- """ - block_size = self.runner.block_size - - block_table = seq_group_metadata.block_tables[seq_id] - seq_len = seq_data.get_len() - context_len = seq_data.get_num_computed_tokens() - - tokens = seq_data.get_last_token_id() - token_positions = seq_len - 1 - block_number = block_table[token_positions // block_size] - block_offset = token_positions % block_size - slot = block_number * block_size + block_offset - - # For paged_attention kernel - if self.runner.sliding_window: - start_idx = max(0, seq_len - self.runner.sliding_window) - start_block = start_idx // block_size - start_idx = start_block * block_size - seq_len = seq_len - start_idx - block_table = block_table[start_block:] - - # For MRotaryEmbedding - if seq_data.mrope_position_delta is not None: - next_pos = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - for idx in range(3): - data.input_mrope_positions[idx].extend( # type: ignore - next_pos[idx]) - else: - data.input_positions.append(token_positions) # type: ignore - - # Update fields - data.input_tokens.append(tokens) - data.max_decode_seq_len = max(data.max_decode_seq_len, seq_len) - data.num_decode_tokens += 1 - data.slot_mapping.append(slot) - data.decode_block_tables.append(block_table) - data.query_lens.append(1) - data.seq_lens.append(seq_len) - - def _compute_prompt_input_tokens(self, data: ModelInputData, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData, seq_id: int): - """ - Compute prompt input tokens, positions, block table and slot mapping. - """ - token_chunk_size = seq_group_metadata.token_chunk_size - block_size = self.runner.block_size - - block_table = seq_group_metadata.block_tables[seq_id] - seq_len = seq_data.get_len() - context_len = seq_data.get_num_computed_tokens() - seq_len = min(seq_len, context_len + token_chunk_size) - - # For prefix caching - prefix_cache_block_num = len(seq_group_metadata.computed_block_nums) - if prefix_cache_block_num > 0: - prefix_cache_len = (prefix_cache_block_num * - self.runner.block_size) - if prefix_cache_len <= context_len: - # We already passed the cache hit region, - # so do normal computation. - pass - elif context_len < prefix_cache_len < seq_len: - # Partial hit. Compute the missing part. - context_len = prefix_cache_len - token_chunk_size = seq_len - context_len - elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last token to avoid - # erroneous behavior. FIXME: Ideally we should directly - # mark all tokens as computed in the scheduler and do not - # schedule this sequence, so this case should not happen. - context_len = seq_len - 1 - token_chunk_size = 1 - - tokens = seq_data.get_token_ids() - tokens = tokens[context_len:seq_len] - token_positions = range(context_len, seq_len) - token_types = seq_group_metadata.token_type_ids - - # For encoder-only models, the block_table is None, - # and there is no need to initialize the slot_mapping. 
- if block_table is not None: - slot_mapping = [_PAD_SLOT_ID] * len(token_positions) - for i, pos in enumerate(token_positions): - block_number = block_table[pos // block_size] - block_offset = pos % block_size - slot = block_number * block_size + block_offset - slot_mapping[i] = slot - data.slot_mapping.extend(slot_mapping) - - # The MROPE positions are prepared in _compute_multi_modal_input - data.input_positions.extend(token_positions) - - if data.token_type_ids is not None: - data.token_type_ids.extend(token_types if token_types else []) - - # Update fields - data.input_tokens.extend(tokens) - data.num_prefills += 1 - data.num_prefill_tokens += len(tokens) - data.query_lens.append(len(tokens)) - data.prefill_block_tables.append(block_table) - data.seq_lens.append(seq_len) - - def _compute_multi_modal_input(self, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData): - computed_len = seq_data.get_num_computed_tokens() - seq_len = self.input_data.seq_lens[-1] - - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, range(computed_len, seq_len)) - - if not mm_kwargs: - return - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - assert not self.chunked_prefill, \ - "MROPE on CPU does not support chunked-prefill." - - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", - None) - assert ( - image_grid_thw is not None or video_grid_thw is not None - or audio_feature_lengths is not None), ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw' or " - "'audio_feature_lengths'.") - - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) - hf_config = self.runner.model_config.hf_config - token_ids = seq_data.get_token_ids() - - mrope_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=computed_len, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - seq_data.mrope_position_delta = mrope_position_delta - - for i in range(3): - self.input_data.input_mrope_positions[ # type: ignore - i].extend(mrope_positions[i]) - - self.input_data.multi_modal_inputs_list.append(mm_kwargs) - for modality, placeholder_map in placeholder_maps.items(): - self.input_data.multi_modal_placeholder_maps[modality].extend( - placeholder_map) - - def _prepare_lora_input( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - is_prefill: bool) -> LoRAMapping: - index_mapping = [] - prompt_mapping = [] - for seq in seq_group_metadata_list: - lora_id = seq.lora_int_id - query_len = seq.token_chunk_size - - index_mapping += [lora_id] * query_len - prompt_mapping += [lora_id] * ( - query_len if seq.sampling_params - and seq.sampling_params.prompt_logprobs is not None else 1) - - return LoRAMapping(index_mapping=tuple(index_mapping), - prompt_mapping=tuple(prompt_mapping), - is_prefill=is_prefill) - - -class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): - """ - Helper class for shared methods between CPU model runners. 
- """ - _model_input_cls: Type[TModelInputForCPU] - _builder_cls: Type[ModelInputForCPUBuilder] - builder: ModelInputForCPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - *args, - **kwargs, - ): - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = False - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) if needs_attn_backend else None - - # Lazy initialization. - self.model: nn.Module # Set after init_Model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.sampler = get_sampler() - - if hasattr(self, "_builder_cls"): - # multi-step model runner does not have `_builder_cls` - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - self.model = get_model(vllm_config=self.vllm_config) - - if self.lora_config: - assert supports_lora( - self.model - ), f"{self.model.__class__.__name__} does not support LoRA yet." - - if supports_multimodal(self.model): - logger.warning("Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model.") - - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - max_position_embeddings=text_config.max_position_embeddings, - ) - self.model = self.lora_manager.create_lora_manager(self.model) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> TModelInputForCPU: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. 
- - """ - self.builder.prepare(finished_requests_ids) - self.builder.set_seq_group_list(seq_group_metadata_list) - - return self.builder.build() # type: ignore - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - -class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( - ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForCPUWithSamplingMetadata: - return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. 
- - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators) - - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine, - is_prompt=is_prompt) - - @torch.no_grad() - def execute_model( - self, - model_input: ModelInputForCPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - previous_hidden_states: Optional[torch.Tensor] = None, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - model_executable = self.model - - multimodal_kwargs = {} - if model_input.multi_modal_kwargs is not None: - multimodal_kwargs = MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs, - device=self.device, - ) - execute_model_kwargs = {} - if previous_hidden_states is not None: - execute_model_kwargs.update( - {"previous_hidden_states": previous_hidden_states}) - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **execute_model_kwargs, - **multimodal_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - # Sample the next token. 
- output = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - if model_input.is_prompt: - output.prefill_hidden_states = hidden_states - output.hidden_states = hidden_states - return [output] - - def generate_proposals(self, *args, **kwargs): - return self.model.generate_proposals(*args, **kwargs) diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py deleted file mode 100644 index 203fdf225a41..000000000000 --- a/vllm/worker/cpu_pooling_model_runner.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import torch - -from vllm.forward_context import set_forward_context -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, - SequenceGroupMetadata) -from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, - ModelInputForCPUBuilder) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): - """ - Used by the CPUPoolingModelRunner. - """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class CPUPoolingModelRunner( - CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( - ModelInputForCPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForCPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - model_executable = self.model - cross_enc_kwargs = {} - if model_input.token_type_ids is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - **cross_enc_kwargs, - "intermediate_tensors": - intermediate_tensors, - } - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable(**execute_model_kwargs) - - # Only perform pooling in the driver worker. 
- if not self.is_driver_worker: - return [] - - return [ - self.model.pooler(hidden_states=hidden_states, - pooling_metadata=model_input.pooling_metadata) - ] - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForCPUWithPoolingMetadata: - return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. - assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - virtual_engine=virtual_engine, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - pooling_params = seq_group_metadata.pooling_params - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py deleted file mode 100644 index ff110e050bb6..000000000000 --- a/vllm/worker/cpu_worker.py +++ /dev/null @@ -1,450 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A CPU worker class.""" -import os -from importlib import util -from typing import List, Optional, Set, Tuple, Type - -import torch -import torch.distributed - -import vllm.envs as envs -from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, VllmConfig) -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache -from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner -from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase -from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class CPUCacheEngine: - """Manages the KV cache for CPU backend. - - This class is responsible for initializing and managing CPU KV - caches. It also provides methods for performing KV cache operations, such - as copying. 
- """ - - def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, - parallel_config: ParallelConfig, - device_config: DeviceConfig) -> None: - assert device_config.device_type == "cpu" - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - - self.head_size = model_config.get_head_size() - self.num_layers = model_config.get_num_layers(parallel_config) - self.num_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks - # for CPU backend, because we want to reuse KV cache management - # in the scheduler. - self.num_cpu_blocks = cache_config.num_gpu_blocks - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - elif cache_config.cache_dtype in ["fp8", "fp8_e5m2"]: - self.dtype = torch.float8_e5m2 - else: - raise NotImplementedError(f"Unsupported KV cache type " - f"{cache_config.cache_dtype}.") - - # Get attention backend. - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - cache_config.cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - - # Initialize the cache. - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks) - - def _allocate_kv_cache( - self, - num_blocks: int, - ) -> List[torch.Tensor]: - """Allocates KV cache on CPU.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_heads, self.head_size) - kv_cache: List[torch.Tensor] = [] - for _ in range(self.num_layers): - kv_cache.append( - torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu")) - return kv_cache - - def swap_in(self, src_to_dst: torch.Tensor) -> None: - raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - - def swap_out(self, src_to_dst: torch.Tensor) -> None: - raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - - def copy(self, src_to_dsts: torch.Tensor) -> None: - self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts) - - @staticmethod - def get_cache_block_size( - block_size: int, - cache_dtype: str, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_layers = model_config.get_num_layers(parallel_config) - - key_cache_block = block_size * num_heads * head_size - value_cache_block = key_cache_block if not model_config.use_mla else 0 - total = num_layers * (key_cache_block + value_cache_block) - if cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - dtype_size = torch.tensor([], dtype=dtype).element_size() - return dtype_size * total - - -class CPUWorker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a CPU socket. - - Each worker is associated with a single CPU socket. The worker is - responsible for maintaining the KV cache and executing the model on the - CPU. In case of distributed inference, each worker is assigned a partition - of the model. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[CPUModelRunner]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - - self.local_rank = local_rank - self.rank = rank - vllm_config.parallel_config.rank = rank - - self.distributed_init_method = distributed_init_method - - self.is_driver_worker = is_driver_worker - if self.is_driver_worker: - assert self.rank == 0, "The driver worker must have rank 0." - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Setup OpenMP threads affinity. - omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND - self.local_omp_cpuid = "all" - if omp_cpuids == "auto": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( - ) - else: - self.local_omp_cpuid = omp_cpuids.split("|")[rank] - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.model == - model_config.model) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ["medusa", "mlp_speculator", "eagle"]) \ - else {"return_hidden_states": True} - ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner - if self.model_config.runner_type == "pooling": - ModelRunnerClass = CPUPoolingModelRunner - elif self.model_config.is_encoder_decoder: - ModelRunnerClass = CPUEncoderDecoderModelRunner - self.model_runner: CPUModelRunnerBase = ModelRunnerClass( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CPUCacheEngine] - # Initialize cpu_cache as pooling models don't initialize kv_caches - self.cpu_cache: Optional[List[List[torch.Tensor]]] = None - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - - def init_device(self) -> None: - if self.local_omp_cpuid != "all": - ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) - if ret: - logger.info(ret) - - # Note: unique identifier for creating allreduce shared memory - os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split( - ":")[-1] - self.device = torch.device("cpu") - self.init_distributed_environment() - # Set random seed. 
- set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of blocks available for the KV cache. - - This determines how many KV blocks can fit into the configured CPU - KV cache space. - - Note that since vLLM assumes a block resides on GPU if it can be - modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. - This allows us to reuse the scheduler of vLLM without generalizing it - to different devices. - """ - # For CPU device, the block number will be calculated based on the - # cpu_kvcache_space. - cache_block_size = self.get_cache_block_size_bytes() - num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // - cache_block_size) - num_cpu_blocks = max(num_cpu_blocks, 0) - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_gpu_blocks = num_cpu_blocks - num_cpu_blocks = 0 - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. Currently, swappable CPU memory is not - supported. - - Since this worker does not support GPUs, we use the num_gpu_blocks to - determine how many non-swappable CPU blocks to allocate. - """ - assert (num_cpu_blocks == 0 - ), f"{type(self)} does not support swappable cache" - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_cpu_blocks = num_gpu_blocks - - self._validate_num_cpu_blocks(num_cpu_blocks) - self.cache_config.num_gpu_blocks = num_cpu_blocks - self.cache_config.num_cpu_blocks = 0 - - # Initialize the cache. - self._init_cache_engine() - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: - """Raise errors if the num_cpu_blocks is invalid. - """ - if num_cpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_cpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " - "initializing the engine.") - - def _init_cache_engine(self) -> None: - self.cache_engine = [ - CPUCacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.cpu_cache = [ - self.cache_engine[ve].cpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - bind_kv_cache(self.compilation_config.static_forward_context, - self.cpu_cache) - self.model_runner.block_size = self.cache_engine[0].block_size - - assert all( - self.cpu_cache[ve] is not None - for ve in range(self.parallel_config.pipeline_parallel_size)) - - # Populate the cache to warmup the memory - for ve in range(self.parallel_config.pipeline_parallel_size): - for layer_cache in self.cpu_cache[ve]: - layer_cache.fill_(0) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.cpu_cache - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - def execute_worker( - self, - worker_input: WorkerInput, - ) -> None: - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[worker_input.virtual_engine].copy( - worker_input.blocks_to_copy) - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - assert execute_model_req is not None - virtual_engine: int = execute_model_req.virtual_engine - num_seq_groups: int = len(execute_model_req.seq_group_metadata_list) - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device="cpu", - dtype=torch.int64).view(-1, 2) - assert len(execute_model_req.blocks_to_swap_in) == 0 - assert len(execute_model_req.blocks_to_swap_out) == 0 - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - ) - - def init_distributed_environment(self) -> None: - """Initialize the distributed environment.""" - - parallel_config = self.parallel_config - rank = self.rank - distributed_init_method = self.distributed_init_method - init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - distributed_init_method=distributed_init_method, - backend="gloo", - ) - - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).cpu()) - - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - def get_cache_block_size_bytes(self) -> int: - """Return the size in bytes of a single KV cache block. - """ - return CPUCacheEngine.get_cache_block_size( - self.cache_config.block_size, self.cache_config.cache_dtype, - self.model_config, self.parallel_config) - - def get_cpus_id_binding_based_on_numa_nodes(self) -> str: - """Return CPUs id binding based on NUMA nodes. 
- """ - rank_to_cpus = self.local_omp_cpuid - # Setup OpenMP thread affinity based on NUMA nodes automatically - world_size = self.vllm_config.parallel_config.world_size - libnuma_found = util.find_spec("numa") is not None - psutil_found = util.find_spec("psutil") is not None - if libnuma_found and psutil_found: - import psutil - from numa import info - cpu_count = psutil.cpu_count(logical=False) - cpus_allow_list = psutil.Process().cpu_affinity() - numa_size = info.get_num_configured_nodes() - cpu_count_per_numa = cpu_count // numa_size - num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, - cpu_count_per_numa // 2) - - # check allow node_to_cpus list - node_to_cpus = [] - for i in range(numa_size): - node_intersect = set( - info.node_to_cpus(i)).intersection(cpus_allow_list) - if bool(node_intersect): - node_to_cpus.append(list(node_intersect)) - - if world_size > len(node_to_cpus): - logger.error( - "Auto thread-binding failed due to " - "world size: %d is larger than " - "allowed NUMA nodes number: %d." - "Please try to bind threads manually.", world_size, - len(node_to_cpus)) - else: - end = cpu_count_per_numa - num_of_reserved_cpu - rank_to_cpus_list = node_to_cpus[self.rank][:end] - rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) - logger.info("auto thread-binding list: %s", rank_to_cpus) - else: - logger.warning( - "Auto thread-binding is not supported due to " - "the lack of package numa and psutil," - "fallback to no thread-binding. To get better performance," - "please try to manually bind threads.") - return rank_to_cpus diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py deleted file mode 100644 index ed9f00166615..000000000000 --- a/vllm/worker/multi_step_tpu_worker.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Dict, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict -from vllm.sequence import ExecuteModelRequest -from vllm.worker.tpu_model_runner import ModelInputForTPU -from vllm.worker.tpu_worker import TPUWorker -from vllm.worker.worker_base import WorkerInput - - -class MultiStepTPUWorker(TPUWorker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cached_model_input: Optional[ModelInputForTPU] = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]: - assert self.is_driver_worker - assert execute_model_req.virtual_engine == 0 - - is_first_multi_step = execute_model_req.is_first_multi_step - is_last_step = execute_model_req.is_last_step - if is_first_multi_step: - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - worker_input = dataclasses.replace( - worker_input, - num_steps=execute_model_req.num_lookahead_slots + 1) - model_input: ModelInputForTPU = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( - model_input, - async_callback=execute_model_req.async_callback) - else: - assert self.cached_model_input is not None - model_input = self.cached_model_input - worker_input = WorkerInput() - model_input = dataclasses.replace( - model_input, - 
is_first_multi_step=is_first_multi_step, - is_last_step=is_last_step) - - if self.do_metadata_broadcast: - if is_first_multi_step: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update( - model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - else: - broadcast_data = { - "is_first_multi_step": is_first_multi_step, - "is_last_step": is_last_step, - } - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str, - torch.Tensor]]]: - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - broadcast_tensor_dict({}, src=0) - return None - - model_input, worker_input, _ = self._get_driver_input_and_broadcast( - execute_model_req) - if model_input.is_first_multi_step: - self.cached_model_input = model_input - return model_input, worker_input, {} - else: - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - if len(broadcast_data) == 2: - assert self.cached_model_input is not None - self.cached_model_input = dataclasses.replace( - self.cached_model_input, - is_first_multi_step=broadcast_data["is_first_multi_step"], - is_last_step=broadcast_data["is_last_step"]) - empty_worker_input = WorkerInput() - return self.cached_model_input, empty_worker_input, {} - - worker_input = WorkerInput.from_broadcasted_tensor_dict( - broadcast_data) - model_input = ( - self.model_runner. - make_model_input_from_broadcasted_tensor_dict(broadcast_data)) - self.cached_model_input = model_input - return model_input, worker_input, {} diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py deleted file mode 100644 index 336bc0bcec36..000000000000 --- a/vllm/worker/tpu_model_runner.py +++ /dev/null @@ -1,909 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import time -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type, Union) -from unittest.mock import patch - -import numpy as np -import torch -import torch.nn as nn -import torch_xla.core.xla_model as xm -import torch_xla.runtime as xr - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, - _add_attn_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -# Here we utilize the behavior that out-of-bound index is ignored. -# FIXME(woosuk): Find a more reliable way to prevent possible bugs. 
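-# For illustration: even a large cache of, say, 16K blocks with 16 slots each
-# has only ~262K valid slot indices, so writes routed to _PAD_SLOT_ID (1e9)
-# land out of bounds and are dropped instead of clobbering real KV entries.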
-_PAD_SLOT_ID = 1_000_000_000 -# FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow. -_ENABLE_TOP_P = False -# FIXME(woosuk): A temporary hack to support `n > 1`. -# This can significantly affect the performance if too large. -_MAX_NUM_SAMPLES = 128 - - -class ExecutionMode(enum.Enum): - PREFILL = enum.auto() - DECODE = enum.auto() - PREFIX_PREFILL = enum.auto() - - def is_prefill(self) -> bool: - return self in (ExecutionMode.PREFILL, ExecutionMode.PREFIX_PREFILL) - - -@dataclass(frozen=True) -class ModelInputForTPU(ModelRunnerInputBase): - token_ids: torch.Tensor - position_ids: torch.Tensor - attn_metadata: AttentionMetadata - input_lens: torch.Tensor - t: torch.Tensor - p: torch.Tensor - num_samples: int - n: List[int] - seq_groups: List[List[int]] - is_first_multi_step: bool = True - is_last_step: bool = True - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - tensor_dict = { - "token_ids": self.token_ids, - "position_ids": self.position_ids, - "input_lens": self.input_lens, - "t": self.t, - "p": self.p, - "num_samples": self.num_samples, - "n": self.n, - "seq_groups": self.seq_groups, - "is_first_multi_step": self.is_first_multi_step, - "is_last_step": self.is_last_step, - "virtual_engine": self.virtual_engine, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["ModelInputForTPU"], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForTPU": - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): - - def __init__( - self, - vllm_config: VllmConfig, - is_driver_worker: bool = False, - ): - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - self.is_driver_worker = is_driver_worker - - self.block_size = self.cache_config.block_size - self.max_num_blocks_per_seq = (self.model_config.max_model_len // - self.block_size) - self.block_tables = np.zeros( - (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq), - dtype=np.int32) - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.cache_config.cache_dtype, - self.block_size, - self.model_config.is_attention_free, - False, - ) - self.cached_step_outputs: List[torch.Tensor] = [] - - smem_size = 512 * 1024 - block_table_size = 4 * self.block_tables.size - if block_table_size >= smem_size: - logger.warning( - "The max_model_len (%d) is too large. This may degrade the " - "performance due to the insufficient smem size. Consider " - "setting --max-model-len to a smaller value, like %d.", - self.model_config.max_model_len, - self.model_config.max_model_len / - (block_table_size / smem_size)) - - def load_model(self) -> None: - self.device = self.device_config.device - - # NOTE(woosuk): While the executor assigns the TP ranks to the worker - # process, the ranks can be different from the ranks internally assigned - # by the xm runtime. Therefore, there is a mismatch in the rank - # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. - # This is not a problem in linear layers because all-reduce is - # rank-agnostic. However, it matters for all-gather as the ranks - # determine the order of concatenating the output tensors. 
- # As a workaround, we use the xm's rank assignment only when loading - # the embedding weights. - xm_tp_rank = xr.global_ordinal() - with patch( - "vllm.model_executor.layers.vocab_parallel_embedding." - "get_tensor_model_parallel_rank", - return_value=xm_tp_rank): - model = get_model(vllm_config=self.vllm_config) - model = model.eval() - xm.wait_device_ops() - model = ModelWrapper(model) - self.model = torch.compile(model, - backend="openxla", - fullgraph=True, - dynamic=False) - - def get_model(self) -> nn.Module: - return self.model.model - - def _dummy_run( - self, - batch_size: int, - seq_len: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - exec_mode: ExecutionMode, - ) -> None: - exec_mode = ExecutionMode(exec_mode) - if exec_mode.is_prefill(): - seq_len = (seq_len + 15) // 16 * 16 - token_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - position_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - slot_mapping = torch.zeros((batch_size, seq_len), - dtype=torch.int64, - device=self.device) - input_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - if exec_mode == ExecutionMode.PREFILL: - attn_metadata = self.attn_backend.make_metadata( - num_prefills=batch_size, - num_prefill_tokens=batch_size * seq_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=None, - context_lens=None, - effective_query_lens=None, - ) - else: - context_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - block_tables = torch.tensor(self.block_tables[:batch_size], - dtype=torch.int32, - device=self.device) - effective_query_lens = torch.ones_like(context_lens) - attn_metadata = self.attn_backend.make_metadata( - num_prefills=batch_size, - num_prefill_tokens=batch_size * seq_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - effective_query_lens=effective_query_lens, - ) - else: - assert seq_len == 1 - token_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - position_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - slot_mapping = torch.zeros((batch_size, seq_len), - dtype=torch.int64, - device=self.device) - block_tables = torch.zeros( - (batch_size, self.max_num_blocks_per_seq), - dtype=torch.int32, - device=self.device) - context_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - input_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - attn_metadata = self.attn_backend.make_metadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size * seq_len, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - ) - t = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) - p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) - num_samples = _MAX_NUM_SAMPLES if exec_mode.is_prefill() else 1 - - # NOTE(woosuk): There are two stages of compilation: torch.compile and - # XLA compilation. Using `mark_dynamic` can reduce the torch.compile - # overhead by reusing the FX graph for different shapes. 
- # However, the XLA graph will still require static shapes and needs to - # be re-compiled for every different shapes. This overhead is inevitable - # in the first run, but can be skipped afterwards as we cache the XLA - # graphs in the disk (VLLM_XLA_CACHE_PATH). - if exec_mode.is_prefill(): - # Prefll - torch._dynamo.mark_dynamic(token_ids, 1) - torch._dynamo.mark_dynamic(position_ids, 1) - torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 1) - else: - # Decode - torch._dynamo.mark_dynamic(token_ids, 0) - torch._dynamo.mark_dynamic(position_ids, 0) - torch._dynamo.mark_dynamic(input_lens, 0) - torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 0) - torch._dynamo.mark_dynamic(attn_metadata.context_lens, 0) - torch._dynamo.mark_dynamic(attn_metadata.block_tables, 0) - torch._dynamo.mark_dynamic(t, 0) - torch._dynamo.mark_dynamic(p, 0) - # Dummy run. - with set_forward_context(attn_metadata, self.vllm_config, 0): - self.model(token_ids, position_ids, input_lens, t, p, num_samples, - kv_caches) - - def warmup_model( - self, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> None: - # Prefill - logger.info("Compiling the model with different input shapes...") - start = time.time() - for batch_size in [1]: - seq_len = 16 - while seq_len <= self.model_config.max_model_len: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.PREFILL) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) - num_tokens = batch_size * seq_len - if num_tokens >= self.scheduler_config.max_num_batched_tokens: - break - seq_len = seq_len * 2 - - end = time.time() - logger.info("Compilation for prefill done in %.2f s.", end - start) - - # Prefix prefill - if self.cache_config.enable_prefix_caching: - logger.info("Compiling the model with different input shapes for " - "prefix prefill...") - start = time.time() - for batch_size in [1]: - seq_len = 16 - while seq_len <= self.model_config.max_model_len: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.PREFIX_PREFILL) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, - seq_len) - num_tokens = batch_size * seq_len - if (num_tokens - >= self.scheduler_config.max_num_batched_tokens): - break - seq_len = seq_len * 2 - end = time.time() - logger.info("Compilation for prefix prefill done in %.2f s.", - end - start) - - # Decode - start = time.time() - seq_len = 1 - batch_size = 8 # Must be in sync with _get_padded_batch_size() - while True: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.DECODE) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) - - if batch_size >= self.scheduler_config.max_num_seqs: - break - batch_size = batch_size + 16 if batch_size >= 16 else batch_size * 2 - - end = time.time() - logger.info("Compilation for decode done in %.2f s.", end - start) - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - prompt_lens: List[int] = [] - context_lens: List[int] = [] - slot_mapping: List[int] = [] - - for batch_idx, seq_group_metadata in enumerate( - seq_group_metadata_list): - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = 
seq_group_metadata.seq_data[seq_id] - # Could include output tokens when a request is preempted. - prompt_tokens = seq_data.get_token_ids() - seq_len = len(prompt_tokens) - - num_computed_blocks = len(seq_group_metadata.computed_block_nums) - num_computed_tokens = num_computed_blocks * self.block_size - if num_computed_tokens > 0: - prompt_tokens = prompt_tokens[num_computed_tokens:] - context_lens.append(seq_len) - else: - context_lens.append(0) - - prompt_len = len(prompt_tokens) - prompt_lens.append(prompt_len) - - input_tokens.extend(prompt_tokens) - input_positions.extend(range(num_computed_tokens, seq_len)) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - for i in range(num_computed_tokens, seq_len): - block_number = block_table[i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - if num_computed_tokens > 0: - self.block_tables[batch_idx, :len(block_table)] = block_table - - # Add paddings to EACH prompt to the smallest power of 2 that is - # greater than or equal to the prompt length. - # We pad the seq_len to reduce the compilation overhead. - # We execute each prompt individually (i.e., with batch_size 1) - # because the FlashAttention kernel does not support ragged inputs. - # TODO(woosuk): Use SplashAttention to support ragged inputs. - padded_prompt_len = _get_padded_prefill_len(prompt_len) - num_paddings = padded_prompt_len - prompt_len - input_tokens += [0] * num_paddings - input_positions += [0] * num_paddings - slot_mapping += [_PAD_SLOT_ID] * num_paddings - - assert len(prompt_lens) > 0 - num_prefills = len(prompt_lens) - input_tokens = torch.tensor(input_tokens, - dtype=torch.int32, - device="cpu") - input_positions = torch.tensor(input_positions, - dtype=torch.int32, - device="cpu") - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.int64, - device="cpu") - prompt_lens = torch.tensor(prompt_lens, - dtype=torch.int32, - device="cpu") - context_lens = torch.tensor(context_lens, - dtype=torch.int32, - device="cpu") - block_tables = torch.tensor(self.block_tables[:num_prefills], - dtype=torch.int32, - device="cpu") - attn_metadata = self.attn_backend.make_metadata( - num_prefills=num_prefills, - num_prefill_tokens=0, # NOTE: This is not used. 
- num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - effective_query_lens=prompt_lens, - ) - return input_tokens, input_positions, attn_metadata, prompt_lens - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] - - batch_idx = 0 - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - context_lens.append(seq_len) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - self.block_tables[batch_idx, :len(block_table)] = block_table - batch_idx += 1 - - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append([slot]) - - batch_size = _get_padded_batch_size(batch_idx) - num_paddings = batch_size - batch_idx - input_tokens = input_tokens + [[0]] * num_paddings - input_positions = input_positions + [[0]] * num_paddings - slot_mapping = slot_mapping + [[_PAD_SLOT_ID]] * num_paddings - context_lens = context_lens + [0] * num_paddings - - input_tokens = torch.tensor(input_tokens, - dtype=torch.int32, - device="cpu") - input_positions = torch.tensor(input_positions, - dtype=torch.int32, - device="cpu") - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.int64, - device="cpu") - context_lens = torch.tensor(context_lens, - dtype=torch.int32, - device="cpu") - block_tables = torch.tensor(self.block_tables[:batch_size], - dtype=torch.int32, - device="cpu") - input_lens = torch.tensor([1] * batch_size, - dtype=torch.int32, - device="cpu") - attn_metadata = self.attn_backend.make_metadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - ) - return input_tokens, input_positions, attn_metadata, input_lens - - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - padded_batch_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor, List[int]]: - assert len(seq_group_metadata_list) > 0 - t = [] - p = [] - n = [] - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - t.append(sampling_params.temperature) - if sampling_params.top_p != 1 and not _ENABLE_TOP_P: - raise NotImplementedError( - "Top-p sampling is currently disabled for the TPU backend " - "due to performance issues.") - p.append(sampling_params.top_p) - if sampling_params.top_k > 0: - raise NotImplementedError( - "Top-k sampling is currently disabled for the TPU backend " - "due to performance issues.") - if sampling_params.n > _MAX_NUM_SAMPLES: - raise NotImplementedError( - f"Best of > {_MAX_NUM_SAMPLES} is not 
supported by the TPU " - "backend.") - n.append(sampling_params.n) - if sampling_params.logprobs is not None: - raise NotImplementedError( - "logprobs is not currently supported by the TPU backend.") - if sampling_params.prompt_logprobs is not None: - raise NotImplementedError( - "prompt_logprobs is not currently supported by the TPU " - "backend.") - - # Repeat the sampling params if the seq group has multiple seqs. - num_seqs = len(seq_group_metadata.seq_data) - t += [t[-1]] * (num_seqs - 1) - p += [p[-1]] * (num_seqs - 1) - n += [n[-1]] * (num_seqs - 1) - - num_paddings = padded_batch_size - len(t) - t += [1.0] * num_paddings - p += [1.0] * num_paddings - - t = torch.tensor(t, dtype=torch.float32, device="cpu") - p = torch.tensor(p, dtype=torch.float32, device="cpu") - return t, p, n - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForTPU: - del finished_requests_ids # Unused. - assert virtual_engine == 0 - assert len(seq_group_metadata_list) > 0 - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - if is_prompt: - inputs = self._prepare_prompt(seq_group_metadata_list) - else: - inputs = self._prepare_decode(seq_group_metadata_list) - input_tokens, input_positions, attn_metadata, input_lens = inputs - padded_batch_size = input_tokens.shape[0] - t, p, n = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) - num_samples = _MAX_NUM_SAMPLES if is_prompt else 1 - - seq_groups = [ - list(metadata.seq_data.keys()) - for metadata in seq_group_metadata_list - ] - return ModelInputForTPU(input_tokens, input_positions, attn_metadata, - input_lens, t, p, num_samples, n, seq_groups) - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: - model_input = ModelInputForTPU.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=self.attn_backend) - return model_input - - @torch.no_grad() - def execute_model( - self, - model_input: ModelInputForTPU, - kv_caches: Optional[List[Any]], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> List[SamplerOutput]: - assert intermediate_tensors is None - if not model_input.is_first_multi_step: - if not model_input.is_last_step: - return [] - - use_async_out_proc = model_input.async_callback is not None - sampler_outputs = [] - num_outputs = len(self.cached_step_outputs) - for i in range(num_outputs): - next_token_ids = self.cached_step_outputs.pop(0) - next_token_ids = next_token_ids.cpu().tolist() - sampler_output = _make_decode_output(next_token_ids, - model_input.seq_groups) - sampler_outputs.append(sampler_output) - - if i < num_outputs - 1 and use_async_out_proc: - assert model_input.async_callback is not None - ctx = model_input.async_callback.keywords[ # type: ignore - "ctx"] - ctx.append_output( - outputs=[sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=i == 0) - model_input.async_callback() - if use_async_out_proc: - return [sampler_outputs[-1]] - else: - return sampler_outputs - - is_prompt = model_input.attn_metadata.num_prefills > 0 - if is_prompt: - assert num_steps == 1 - # NOTE(woosuk): Since the FlashAttention kernel does not support - # ragged inputs, we split the prompts into 
different batches and - # process them separately. This is a temporary hack that should be - # optimized by using SplashAttention. - orig_slot_mapping = model_input.attn_metadata.slot_mapping - orig_block_tables = model_input.attn_metadata.block_tables - orig_context_lens = model_input.attn_metadata.context_lens - orig_effective_query_lens = \ - model_input.attn_metadata.effective_query_lens - batch_size = model_input.input_lens.shape[0] - start_idx = 0 - next_token_ids = [] - for i in range(batch_size): - # Get the actual prefill_len. - prefill_len = model_input.input_lens[i:i + 1].item() - prefill_len = _get_padded_prefill_len(prefill_len) - end_idx = start_idx + prefill_len - - token_ids = model_input.token_ids[None, start_idx:end_idx].to( - self.device) - position_ids = model_input.position_ids[None, - start_idx:end_idx].to( - self.device) - attn_metadata = model_input.attn_metadata - attn_metadata.num_prefills = 1 - attn_metadata.slot_mapping = orig_slot_mapping[ - None, start_idx:end_idx].to(self.device) - if orig_context_lens[i].item() > 0: - attn_metadata.context_lens = orig_context_lens[i:i + 1].to( - self.device) - attn_metadata.block_tables = orig_block_tables[ - i].unsqueeze(0).to(self.device) - attn_metadata.effective_query_lens = \ - orig_effective_query_lens[i:i + 1].to(self.device) - else: - attn_metadata.context_lens = None - attn_metadata.block_tables = None - attn_metadata.effective_query_lens = None - input_lens = model_input.input_lens[i:i + 1].to(self.device) - t = model_input.t[i:i + 1].to(self.device) - p = model_input.p[i:i + 1].to(self.device) - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - model_input.virtual_engine): - output_token_ids = self.model(token_ids, position_ids, - input_lens, t, p, - model_input.num_samples, - kv_caches) - next_token_ids.append(output_token_ids[0]) - start_idx = end_idx - - if model_input.async_callback is not None: - model_input.async_callback() - # Retrieve the outputs to CPU. - next_token_ids = [ - output_token_ids.cpu().tolist() - for output_token_ids in next_token_ids - ] - - # NOTE(woosuk): Minimal code to construct the sampler outputs. - # The TPU backend does not reuse the sampler, since the TPU backend - # does not support advanced sampling parameters such as logprobs. 
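- # For illustration: a prompt scheduled with n=2 yields one
- # CompletionSequenceGroupOutput holding two SequenceOutput entries, each
- # with a placeholder Logprob(0.0) for its sampled token id.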
- zero_logprob = Logprob(0.0) - sampler_outputs = [] - for i, seq_group in enumerate(model_input.seq_groups): - seq_ids = seq_group - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - seq_outputs = [] - for j in range(model_input.n[i]): - next_token_id = next_token_ids[i][j] - seq_outputs.append( - SequenceOutput(seq_id, next_token_id, - {next_token_id: zero_logprob})) - sampler_outputs.append( - CompletionSequenceGroupOutput(seq_outputs, None)) - return [SamplerOutput(sampler_outputs)] - else: - token_ids = model_input.token_ids.to(self.device) - position_ids = model_input.position_ids.to(self.device) - attn_metadata = model_input.attn_metadata - attn_metadata.slot_mapping = attn_metadata.slot_mapping.to( - self.device) - attn_metadata.block_tables = attn_metadata.block_tables.to( - self.device) - attn_metadata.context_lens = attn_metadata.context_lens.to( - self.device) - t = model_input.t.to(self.device) - p = model_input.p.to(self.device) - input_lens = model_input.input_lens.to(self.device) - for i in range(num_steps): - slot_mapping = attn_metadata.slot_mapping - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - model_input.virtual_engine): - output_token_ids = self.model(token_ids, position_ids, - input_lens, t, p, - model_input.num_samples, - kv_caches) - self.cached_step_outputs.append(output_token_ids) - - if i < num_steps - 1: - # Prepare the inputs for the next step. - token_ids = output_token_ids.unsqueeze(dim=1).int() - position_ids = position_ids + 1 - attn_metadata.context_lens = attn_metadata.context_lens + 1 - - block_tables = attn_metadata.block_tables - block_number = block_tables.gather( - 1, - position_ids.long() // self.block_size) - block_offset = position_ids % self.block_size - - is_padding = slot_mapping == _PAD_SLOT_ID - slot_mapping = block_number * self.block_size + block_offset - slot_mapping = slot_mapping.long() - slot_mapping = torch.where(is_padding, _PAD_SLOT_ID, - slot_mapping) - attn_metadata.slot_mapping = slot_mapping - - if model_input.async_callback is not None: - model_input.async_callback() - - if num_steps > 1: - return [] - # Retrieve the outputs to CPU. - next_token_ids = self.cached_step_outputs.pop(0) - next_token_ids = next_token_ids.cpu().tolist() - sampler_output = _make_decode_output(next_token_ids, - model_input.seq_groups) - return [sampler_output] - - -class ModelWrapper(nn.Module): - - def __init__(self, model: nn.Module): - super().__init__() - self.model = model - - def forward( - self, - token_ids: torch.Tensor, - position_ids: torch.Tensor, - input_lens: torch.Tensor, - t: torch.Tensor, - p: torch.Tensor, - num_samples: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> torch.Tensor: - """Executes the forward pass of the model and samples the next token. - - Args: - token_ids: The input token IDs of shape [batch_size, seq_len]. - position_ids: The input position IDs of shape [batch_size, seq_len]. - input_lens: The actual input lengths of shape [batch_size]. - t: The sampling temperature of shape [batch_size]. - p: The top-p probability of shape [batch_size]. - num_samples: Number of samples to draw from each logits vector. - kv_caches: The key and value caches. They can be None during the - memory profiling at initialization. - """ - batch_size, seq_len = token_ids.shape - # Calculate the positions to sample from. 
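- # For illustration: with batch_size=2, seq_len=8 and input_lens=[5, 3],
- # start_indices is [0, 8] and logits_indices is [4, 10], i.e. the index of
- # the last real (non-padding) token of each sequence in the flattened batch.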
- start_indices = torch.arange( - batch_size, dtype=torch.int32, device=input_lens.device) * seq_len - logits_indices = start_indices + input_lens - 1 - attn_metadata = get_forward_context().attn_metadata - - # FIXME(woosuk): This is a temporary hack to avoid using the existing - # sampler and sampling metadata. - sampling_metadata = SamplingMetadata( - seq_groups=[], - selected_token_indices=logits_indices, - categorized_sample_indices={}, - num_prompts=attn_metadata.num_prefills, - ) - - # Skip this in memory profiling at initialization. - if kv_caches[0][0].numel() > 0: - # index_copy_(slot_mapping) only works when the inserted dimension - # is 0. However, the KV cache in the Pallas backend has the shape - # [num_kv_heads, num_blocks, block_size, head_size]. To make it - # work, we need to flatten the first three dimensions and modify - # the slot_mapping accordingly. - num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape - slot_mapping = attn_metadata.slot_mapping - slot_mapping = slot_mapping.flatten() - head_indices = torch.arange(0, - num_kv_heads, - device=slot_mapping.device, - dtype=slot_mapping.dtype) - head_indices *= block_size * num_blocks - slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( - -1, num_kv_heads) - slot_mapping = slot_mapping + head_indices.view(1, -1) - slot_mapping = slot_mapping.flatten() - attn_metadata.slot_mapping = slot_mapping - - hidden_states = self.model(token_ids, position_ids) - hidden_states = hidden_states.flatten(0, 1) - logits = self.model.compute_logits(hidden_states, sampling_metadata) - - # Argmax sampling. - argmax_token_ids = torch.argmax(logits, dim=-1, keepdim=True) - argmax_token_ids = argmax_token_ids.repeat(1, num_samples) - - # Zero temperature means greedy decoding. Avoid division by zero. - nonzero_t = torch.where(t != 0, t, 1.0) - logits = logits / nonzero_t.unsqueeze(dim=1) - if _ENABLE_TOP_P: - logits = _apply_top_p(logits, p.unsqueeze(dim=1)) - - # Random sampling. - probs = torch.softmax(logits, dim=-1, dtype=torch.float32) - sampled_token_ids = torch.multinomial(probs, - num_samples, - replacement=True) - if num_samples == 1: - argmax_token_ids = argmax_token_ids.squeeze(dim=-1) - sampled_token_ids = sampled_token_ids.squeeze(dim=-1) - next_token_ids = torch.where(t != 0, sampled_token_ids, - argmax_token_ids) - return next_token_ids - - -def _get_padded_prefill_len(x: int) -> int: - # NOTE(woosuk): The pallas FlashAttention kernel requires the sequence - # length to be a multiple of 16. We pad the prompt length to the nearest - # multiple of 16. This is also good for performance. - if x <= 16: - return 16 - return 1 << (x - 1).bit_length() - - -def _get_padded_batch_size(batch_size: int) -> int: - # The GMM Pallas kernel requires num_tokens * topk to be a multiple of 16. - # To meet this requirement in the simplest way, we set the minimal batch - # size to 8. 
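- # For illustration: _get_padded_batch_size(1) == 8, (9) == 16, (17) == 32,
- # and (33) == 48; small batches snap to 8, larger ones round up to the next
- # multiple of 16.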
- if batch_size <= 8: - return 8 - else: - return ((batch_size + 15) // 16) * 16 - - -def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor: - logits_sorted = torch.sort(logits, dim=-1, descending=True).values - sorted_cum_probs = torch.cumsum(logits_sorted.softmax(dim=-1), dim=-1) - cutoff_index = torch.sum(sorted_cum_probs < p, dim=-1, keepdim=True) - cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index) - logits = logits.masked_fill_(logits < cutoff_logit, -float("inf")) - return logits - - -def _make_decode_output( - next_token_ids: List[int], - seq_groups: List[List[int]], -) -> SamplerOutput: - zero_logprob = Logprob(0.0) - sampler_outputs = [] - batch_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group - seq_outputs = [] - for seq_id in seq_ids: - next_token_id = next_token_ids[batch_idx] - seq_outputs.append( - SequenceOutput(seq_id, next_token_id, - {next_token_id: zero_logprob})) - batch_idx += 1 - sampler_outputs.append(CompletionSequenceGroupOutput( - seq_outputs, None)) - return SamplerOutput(sampler_outputs) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py deleted file mode 100644 index ad5ed19e2f89..000000000000 --- a/vllm/worker/tpu_worker.py +++ /dev/null @@ -1,337 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from typing import List, Optional, Tuple, Union - -import torch -import torch_xla.core.xla_model as xm -import torch_xla.debug.profiler as xp -import torch_xla.runtime as xr - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size -from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoRANotSupportedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - - assert self.device_config.device_type == "tpu" - if self.cache_config.cache_dtype == "auto": - self.cache_dtype = self.model_config.dtype - else: - self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ - self.cache_config.cache_dtype] - - self.model_runner: TPUModelRunner = TPUModelRunner( - vllm_config=vllm_config, is_driver_worker=is_driver_worker) - - if self.model_config.seed is None: - self.model_config.seed = 0 - - if vllm_config.lora_config is not None: - raise NotImplementedError( - "The V0 TPU backend doesn't support LoRA serving") - - def init_device(self) -> None: - os.environ["PJRT_DEVICE"] = "TPU" - torch.set_grad_enabled(False) - torch.set_default_dtype(self.model_config.dtype) - - # NOTE(woosuk): This is just to initialize the TP group and broadcast - # the input objects on CPU. 
The all-reduce and all-gather ops on TPU - # are invoked by `xm.all_reduce` and `xm.all_gather` which use their - # own context. - init_distributed_environment( - world_size=self.parallel_config.world_size, - rank=self.rank, - local_rank=self.local_rank, - distributed_init_method=self.distributed_init_method, - backend="gloo", - ) - ensure_model_parallel_initialized( - self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size) - - # Device initialization should happen after initializing the distributed - # runtime. - self.device = xm.xla_device() - self.device_config.device = self.device - - # Set random seed. - set_random_seed(self.model_config.seed) - xm.set_rng_state(self.model_config.seed, self.device) - - # Increase the cache size limit, which is the maximum number of - # dynamo graphs that can be compiled. - # NOTE(woosuk): Usually, we compile 10-15 graphs for prefill and - # 30-40 graphs for decode. 128 is an arbitrary safe number. - torch._dynamo.config.cache_size_limit = 128 - # Use persistent cache to avoid XLA recompilation. - # NOTE(woosuk): Set per-rank cache path since different ranks - # can have slightly different XLA graphs. - world_size = self.parallel_config.world_size - rank = xr.global_ordinal() - # The PyTorch/XLA compilation cache uses the Torch IR to generate keys. - # Consequently, changes in optimization flags, which affect compilation - # results, don't change the cache key. This can result in the wrong - # compilation being used. To prevent this, disabling the XLA compilation - # cache during development is recommended.We can disable it by - # `export VLLM_XLA_CACHE_PATH=` - if envs.VLLM_XLA_CACHE_PATH: - per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{rank}") - xr.initialize_cache(per_rank_path, readonly=False) - - self.profiler = None - if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: - # For TPU, we can only have 1 active profiler session for 1 profiler - # server. So we only profile on rank0. - self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - self.profile_dir) - self.profiler = xp.start_server(9012) - - def start_profile(self): - if self.rank < 1: - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - xp.start_trace(self.profile_dir) - - def stop_profile(self): - if self.rank < 1: - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - xp.stop_trace() - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - num_layers = self.model_config.get_num_layers(self.parallel_config) - head_size = self.model_config.get_head_size() - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - kv_caches = [(torch.tensor([], dtype=torch.float32, - device=self.device), - torch.tensor([], dtype=torch.float32, - device=self.device)) - for _ in range(num_layers)] - bind_kv_cache(self.compilation_config.static_forward_context, - [kv_caches]) - self.model_runner._dummy_run( - batch_size=1, - seq_len=self.scheduler_config.max_num_batched_tokens, - kv_caches=kv_caches, - exec_mode=ExecutionMode.PREFILL, - ) - # Synchronize before measuring the memory usage. 
- xm.wait_device_ops() - - # Get the maximum amount of memory used by the model weights and - # intermediate activations. - m = xm.get_memory_info(self.device) - total_memory_size = m["bytes_limit"] - profiled = m["peak_bytes_used"] # Weights + intermediate activations. - - # Calculate the TPU KV cache size based on profiling. - usable_memory_size = int(total_memory_size * - self.cache_config.gpu_memory_utilization) - tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0) - dtype_bytes = get_dtype_size(self.cache_dtype) - block_size_bytes = (dtype_bytes * self.cache_config.block_size * - num_layers * 2 * head_size * num_kv_heads) - num_tpu_blocks = tpu_kv_cache_bytes // block_size_bytes - num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. - - # Calculate the CPU KV cache size based on the config. - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - block_size_bytes) - num_cpu_blocks = (num_cpu_blocks // 8) * 8 # Round down to 8. - return num_tpu_blocks, num_cpu_blocks - - def initialize_cache( - self, - num_gpu_blocks: int, - num_cpu_blocks: int, - ) -> None: - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - self.block_size = self.cache_config.block_size - - dtype = self.cache_dtype - num_layers = self.model_config.get_num_layers(self.parallel_config) - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - - self.cpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - self.tpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( - num_gpu_blocks, self.block_size, num_kv_heads, head_size) - cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( - num_cpu_blocks, self.block_size, num_kv_heads, head_size) - for _ in range(num_layers): - tpu_k_cache = torch.zeros(tpu_cache_shape, - dtype=dtype, - device=self.device) - tpu_v_cache = torch.zeros_like(tpu_k_cache) - self.tpu_cache.append((tpu_k_cache, tpu_v_cache)) - cpu_k_cache = torch.zeros(cpu_cache_shape, - dtype=dtype, - device="cpu") - cpu_v_cache = torch.zeros_like(cpu_k_cache) - self.cpu_cache.append((cpu_k_cache, cpu_v_cache)) - bind_kv_cache(self.compilation_config.static_forward_context, - [self.tpu_cache]) - self._warmup_model() - - def _warmup_model(self) -> None: - # FIXME(woosuk): Here we are abusing `enforce_eager` which is defined - # for CUDA graphs. We should refactor this part. - if not self.model_config.enforce_eager: - # Warm up the model with all possible input shapes so that - # compilation never happens during the actual execution. - # This may take ~30 mins for the first run and ~20 mins for the - # subsequent runs. - # If `enforce_eager` is True, the ahead-of-time compilation is - # skipped and the compilation happens during the actual execution, - # which is bad for performance but useful for development. 
- self.model_runner.warmup_model(self.tpu_cache) - - def get_cache_block_size_bytes(self) -> int: - head_size = self.model_config.get_head_size() - num_heads = self.model_config.get_num_kv_heads(self.parallel_config) - num_layers = self.model_config.get_num_layers(self.parallel_config) - - key_cache_block = self.cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_layers * (key_cache_block + value_cache_block) - dtype_size = get_dtype_size(self.cache_dtype) - return dtype_size * total - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - # NOTE(woosuk): This assumes virtual_engine == 0, i.e., no pipeline - # parallelism. - return [self.tpu_cache] - - def prepare_worker_input( - self, - execute_model_req: ExecuteModelRequest, - ) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - blocks_to_swap_in = _make_src_to_dst( - execute_model_req.blocks_to_swap_in, "cpu", self.device) - blocks_to_swap_out = _make_src_to_dst( - execute_model_req.blocks_to_swap_out, self.device, "cpu") - blocks_to_copy = _make_src_to_dst(execute_model_req.blocks_to_copy, - self.device, self.device) - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - ) - - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - assert virtual_engine == 0 - attn_backend = self.model_runner.attn_backend - num_layers = self.model_config.get_num_layers(self.parallel_config) - - # Issue cache operations. - if worker_input.blocks_to_swap_in is not None: - src_indices, dst_indices = worker_input.blocks_to_swap_in - if src_indices.numel() > 0: - # Swap from CPU to TPU. - for i in range(num_layers): - tpu_k_cache, tpu_v_cache = self.tpu_cache[i] - cpu_k_cache, cpu_v_cache = self.cpu_cache[i] - k = cpu_k_cache[:, src_indices].to(self.device) - v = cpu_v_cache[:, src_indices].to(self.device) - _insert_kv(k, v, dst_indices, tpu_k_cache, tpu_v_cache) - - if worker_input.blocks_to_swap_out is not None: - src_indices, dst_indices = worker_input.blocks_to_swap_out - if src_indices.numel() > 0: - # Swap from TPU to CPU. 
- for i in range(num_layers): - tpu_k_cache, tpu_v_cache = self.tpu_cache[i] - cpu_k_cache, cpu_v_cache = self.cpu_cache[i] - cpu_k_cache[:, dst_indices] = tpu_k_cache[:, src_indices] - cpu_v_cache[:, dst_indices] = tpu_v_cache[:, src_indices] - - if worker_input.blocks_to_copy is not None: - src_indices, dst_indices = worker_input.blocks_to_copy - if src_indices.numel() > 0: - attn_backend.copy_blocks(self.tpu_cache, - (src_indices, dst_indices)) - - -def _make_src_to_dst( - mapping: List[Tuple[int, int]], - src_device: Union[torch.device, str], - dst_device: Union[torch.device, str], -) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: - if not mapping: - return None - - src_indices = [i for i, _ in mapping] - dst_indices = [i for _, i in mapping] - src_indices = torch.tensor(src_indices, - device=src_device, - dtype=torch.int64) - dst_indices = torch.tensor(dst_indices, - device=dst_device, - dtype=torch.int64) - return src_indices, dst_indices - - -@torch.compile(backend="openxla") -def _insert_kv( - k: torch.Tensor, - v: torch.Tensor, - indices: torch.Tensor, - tpu_k_cache: torch.Tensor, - tpu_v_cache: torch.Tensor, -) -> None: - torch.ops.xla.dynamo_set_buffer_donor_(tpu_k_cache, True) - torch.ops.xla.dynamo_set_buffer_donor_(tpu_v_cache, True) - tpu_k_cache[:, indices] = k - tpu_v_cache[:, indices] = v diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py deleted file mode 100644 index b2d3ce8526d5..000000000000 --- a/vllm/worker/xpu_model_runner.py +++ /dev/null @@ -1,606 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import time -import weakref -from collections import defaultdict -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type, TypeVar) - -import torch -import torch.nn as nn - -from vllm.attention import get_attn_backend -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadataCache -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.model_loader import get_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad -from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -_PAD_SLOT_ID = -1 - -TModelInputForXPU = TypeVar('TModelInputForXPU', bound="ModelInputForXPU") - - -@dataclass(frozen=True) -class ModelInputForXPU(ModelRunnerInputBase): - """ - Used by the NeuronModelRunner. 
- """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - async_callback: Optional[Callable] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForXPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForXPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -@dataclass(frozen=True) -class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForXPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): - - def __init__(self, - runner: "XPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: - super().__init__() - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.device = self.runner.device - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - self.seq_group_metadata_list.append(seq_group_metadata) - - def build(self) -> ModelInputForXPU: - is_prompt = self.seq_group_metadata_list[0].is_prompt - # Prepare input tensors. 
- if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) = self._prepare_prompt( - self.seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode( - self.seq_group_metadata_list) - seq_lens = None - multi_modal_kwargs = None - - return self.model_input_cls( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - multi_modal_kwargs=multi_modal_kwargs, - seq_lens=seq_lens, - query_lens=seq_lens, - ) - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - computed_len = seq_data.get_num_computed_tokens() - seq_len = len(prompt_tokens) - - seq_lens.append(seq_len) # Prompt token num - input_tokens.extend(prompt_tokens) # Token ids - - # Token position ids - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - positions_range = range(computed_len, seq_len) - input_positions.extend(list(positions_range)) - - if seq_group_metadata.multi_modal_data: - # NOTE: mm_kwargs only includes the subset of multi-modal items - # that intersect with the current prefill positions. - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap \ - .from_seq_group(seq_group_metadata, positions_range) - - multi_modal_kwargs_list.append(mm_kwargs) - - for modality, placeholder_map in placeholder_maps.items(): - multi_modal_placeholder_maps[modality].extend( - placeholder_map) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. - slot_mapping.extend([_PAD_SLOT_ID] * seq_len) - continue - - # Compute the slot mapping. - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, seq_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
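# [Editor's illustration -- not part of this patch] A self-contained sketch that
# reproduces the slot-mapping example given in the comment above: prompt length 10,
# sliding window 8, block size 4. The block table [0, 1, 0] is an assumed example
# (the first physical block is reused once the sliding window rolls past it); it is
# not taken from the patch itself.
_PAD_SLOT_ID = -1

def make_slot_mapping(block_table, seq_len, block_size, sliding_window=None):
    start_idx = max(0, seq_len - sliding_window) if sliding_window is not None else 0
    slots = []
    for i in range(seq_len):
        if i < start_idx:
            slots.append(_PAD_SLOT_ID)  # Token falls outside the sliding window.
            continue
        block_number = block_table[i // block_size]
        slots.append(block_number * block_size + i % block_size)
    return slots

assert make_slot_mapping([0, 1, 0], seq_len=10, block_size=4,
                         sliding_window=8) == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]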
- start_idx = 0 - if self.sliding_window is not None: - start_idx = max(0, seq_len - self.sliding_window) - - for i in range(computed_len, seq_len): - if i < start_idx: - slot_mapping.append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - num_prompt_tokens = len(input_tokens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) # type: ignore - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) # type: ignore - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) # type: ignore - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - multi_modal_placeholder_maps.items() - } - - max_seqlen = max(seq_lens) - tmp = [0] - tmp.extend(seq_lens) - seqlen = torch.tensor(tmp) - seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - seq_lens=seq_lens, - seqlen_q=seqlen_q, - max_seqlen=max_seqlen, - seq_lens_tensor=torch.tensor([]), - max_decode_seq_len=0, - num_prefills=len(seq_lens), - num_prefill_tokens=num_prompt_tokens, - num_decode_tokens=0, - block_tables=torch.tensor([], device=self.device, dtype=torch.int), - ) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append(generation_token) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append(position) - - seq_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - seq_lens.append(seq_len) - - block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - max_decode_seq_len = max(seq_lens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - - block_tables = make_tensor_with_pad( - 
block_tables, - pad=0, - dtype=torch.int, - device=self.device, - ) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=False, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - seq_lens=seq_lens, - seqlen_q=torch.tensor([]), - max_seqlen=0, - seq_lens_tensor=seq_lens_tensor, - max_decode_seq_len=max_decode_seq_len, - num_prefill_tokens=0, - num_decode_tokens=len(input_tokens), - num_prefills=0, - block_tables=block_tables, - ) - return ( - input_tokens, - input_positions, - attn_metadata, - ) - - -class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForXPUWithSamplingMetadata] = ( - ModelInputForXPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - model_config = self.model_config - cache_config = self.cache_config - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization. - self.model: nn.Module # Set after init_Model - self.sampler = get_sampler() - - self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - with DeviceMemoryProfiler() as m: - self.model = get_model(vllm_config=self.vllm_config) - - self.model_memory_usage = m.consumed_memory - logger.info("Loading model weights took %.4f GiB", - self.model_memory_usage / GiB_bytes) - - def get_model(self) -> nn.Module: - return self.model - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, which - # needs to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. 
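# [Editor's illustration -- not part of this patch] A small sketch of the worst-case
# profiling batch sizing described in the comment above: cap the number of sequences
# so each can carry one multi-modal item within the token budget, then spread the
# budget as evenly as possible. The concrete numbers (8192 batched tokens, 256 max
# seqs, 576 tokens per image) are assumed examples, not values from the patch.
max_num_batched_tokens = 8192
max_num_seqs = 256
max_mm_tokens = 576

if max_mm_tokens > 0:
    max_num_seqs = max(1, min(max_num_seqs,
                              max_num_batched_tokens // max_mm_tokens))

# The first (budget % seqs) sequences get one extra token so the total matches exactly.
seq_lens = [max_num_batched_tokens // max_num_seqs +
            (group_id < max_num_batched_tokens % max_num_seqs)
            for group_id in range(max_num_seqs)]
assert sum(seq_lens) == max_num_batched_tokens
print(max_num_seqs, seq_lens[0], seq_lens[-1])  # 14 586 585 with these assumed numbers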
- max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data.multi_modal_placeholders) - seqs.append(seq) - - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - self.execute_model(model_input, None, intermediate_tensors) - torch.xpu.synchronize() - return - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForXPUWithSamplingMetadata: - return ( - ModelInputForXPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForXPUWithSamplingMetadata: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - """ - builder = self.builder - builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) - - return builder.build() # type: ignore - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForXPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. 
- - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators, - cache=self.sampling_metadata_cache) - - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForXPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "XPUModelRunner does not support multi-step execution.") - - model_executable = self.model - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start_time = time.time() - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - # Compute the logits in the last pipeline stage. - if not get_pp_group().is_last_rank: - return hidden_or_intermediate_states - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end_time = time.time() - - # Compute the logits. - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_time = (model_forward_end_time - - model_forward_start_time) - # If there are multiple workers, we are still tracking the latency - # from the start time of the driver worker to the end time of the - # driver worker. The model forward time will then end up covering - # the communication time as well. 
- output.model_forward_time = model_forward_time - - return [output] diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py deleted file mode 100644 index fe321c059f52..000000000000 --- a/vllm/worker/xpu_worker.py +++ /dev/null @@ -1,186 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A XPU worker class.""" -import gc -import os -from typing import List, Optional, Tuple - -import intel_extension_for_pytorch # noqa: F401 -import oneccl_bindings_for_pytorch # noqa: F401 -import torch -import torch.distributed - -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.distributed.parallel_state import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor import set_random_seed -from vllm.platforms import current_platform -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase -from vllm.worker.xpu_model_runner import XPUModelRunner - -logger = init_logger(__name__) - - -class XPUWorker(LoRANotSupportedWorkerBase, Worker): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single XPU device. The worker is - responsible for maintaining the KV cache and executing the model on the - XPU. In case of distributed inference, each worker is assigned a partition - of the model. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - device_config = self.device_config - parallel_config = self.parallel_config - assert device_config.device_type == "xpu" - assert current_platform.is_xpu() - - self.parallel_config.rank = rank - - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if parallel_config and is_driver_worker: - assert rank % parallel_config.tensor_parallel_size == 0, \ - "Driver worker should be rank 0 of tensor parallel group." - - self.model_runner = XPUModelRunner( # type: ignore - vllm_config=vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - ) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - self.gpu_cache: Optional[List[List[torch.Tensor]]] - - def init_device(self) -> None: - if self.device_config.device.type == "xpu" and current_platform.is_xpu( - ): - self.device = torch.device(f"xpu:{self.local_rank}") - torch.xpu.set_device(self.device) - torch.xpu.empty_cache() - self.init_gpu_memory = torch.xpu.get_device_properties( - self.local_rank).total_memory - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - self.init_worker_distributed_environment() - # Initialize the model. - set_random_seed(self.model_config.seed) - - # keep this method for `empty_cache` and `synchronize` api - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. 
- Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.xpu.empty_cache() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - torch.xpu.synchronize() - used_memory = torch.xpu.memory_allocated() - total_gpu_memory = torch.xpu.get_device_properties( - self.local_rank).total_memory - free_gpu_memory = total_gpu_memory - used_memory - - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - cache_block_size = self.get_cache_block_size_bytes() - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - gc.collect() - torch.xpu.empty_cache() - return num_gpu_blocks, num_cpu_blocks - - def _warm_up_model(self) -> None: - # IPEX don't support capture graph yet - pass - - def init_worker_distributed_environment(self) -> None: - """Initialize the distributed environment.""" - - parallel_config = self.parallel_config - rank = self.rank - distributed_init_method = self.distributed_init_method - - if torch.distributed.is_initialized(): - torch_world_size = torch.distributed.get_world_size() - if torch_world_size != parallel_config.world_size: - raise RuntimeError( - "torch.distributed is already initialized but the torch " - "world size does not match parallel_config.world_size " - f"({torch_world_size} vs. {parallel_config.world_size}).") - elif not distributed_init_method: - raise ValueError( - "distributed_init_method must be set if torch.distributed " - "is not already initialized") - else: - # use sockets as default Level zero IPC exchange backend. By - # default oneccl will use `drmfd` as mechanism which need extra - # dependency (libdrm and drm headers) on your system. 
- ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") - ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", - str(parallel_config.world_size)) - os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT - os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE - os.environ["LOCAL_RANK"] = str(self.local_rank) - init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - distributed_init_method=distributed_init_method, - local_rank=self.local_rank, - backend="ccl") - - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - # global all_reduce needed for overall oneccl warm up - torch.distributed.all_reduce(torch.zeros(1).xpu()) - - if parallel_config.pipeline_parallel_size > 1: - # Add pp group init to avoid - # p2p communication as the first call - get_pp_group().all_reduce(torch.zeros(1).xpu()) From cda307df489cf96c26ffd5312a9681d6571b2e0b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 22:13:09 -0700 Subject: [PATCH 02/13] remove attn backends Signed-off-by: Woosuk Kwon --- vllm/attention/backends/ipex_attn.py | 403 --------------- vllm/attention/backends/torch_sdpa.py | 707 -------------------------- 2 files changed, 1110 deletions(-) delete mode 100644 vllm/attention/backends/ipex_attn.py delete mode 100644 vllm/attention/backends/torch_sdpa.py diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py deleted file mode 100644 index 410ada3b0828..000000000000 --- a/vllm/attention/backends/ipex_attn.py +++ /dev/null @@ -1,403 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" Attention layer with torch scaled_dot_product_attention - and PagedAttention.""" -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch - -from vllm._ipex_ops import ipex_ops -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - -_PARTITION_SIZE = 512 - - -class IpexAttnBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "IPEX" - - @staticmethod - def get_impl_cls() -> Type["IpexAttnBackendImpl"]: - return IpexAttnBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["IpexAttnMetadata"]: - return IpexAttnMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - from vllm._ipex_ops import ipex_ops as ops - ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - from vllm._ipex_ops import ipex_ops as ops - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, 
src_to_dists) - - -@dataclass -class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for IpexAttnBackend. - """ - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. - is_prompt: bool - slot_mapping: torch.Tensor - seq_lens: Optional[List[int]] - seqlen_q: Optional[torch.Tensor] - max_seqlen: Optional[int] - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None - - @property - def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: - # Currently chunked prefill is not supported - if self.num_decode_tokens == 0: - assert self.num_prefills > 0 - return self - - return None - - @property - def decode_metadata(self) -> Optional["IpexAttnMetadata"]: - # Currently chunked prefill is not supported - if self.num_prefills > 0: - assert self.num_decode_tokens == 0 - return None - - return self - - -class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if use_irope: - logger.warning_once( - "Using irope in Ipex is not supported yet, it will fall" - " back to global attention for long context.") - if blocksparse_params is not None: - raise ValueError( - "IPEX backend does not support block-sparse attention.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.sliding_window is not None) - if logits_soft_cap is None: - logits_soft_cap = -1 - self.logits_soft_cap = logits_soft_cap - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - if is_quantized_kv_cache(kv_cache_dtype): - raise NotImplementedError( - "IPEX backend does not support FP8 KV cache. 
" - "Please use xFormers backend instead.") - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "IpexAttnBackendImpl") - - def split_kv_cache( - self, - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 1 - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, - -1, x) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) - return key_cache, value_cache - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: IpexAttnMetadata, # type: ignore - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with IPEX varlen_attention and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for IpexAttentionImpl") - - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - num_tokens, hidden_size = query.shape - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if kv_cache.numel() > 0: - key_cache, value_cache = self.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - ipex_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - - if attn_metadata.is_prompt: - assert attn_metadata.seq_lens is not None - if (kv_cache.numel() == 0 - or attn_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=1) - - if attn_metadata.attn_bias is None: - if self.sliding_window is not None: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, None, dtype=query.dtype) - attn_metadata.attn_bias = att_masks - - output = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype, - device=query.device) - ipex_ops.varlen_attention( - query, - key, - value, - output, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - self.alibi_slopes, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None, - window_size_left=-1, - window_size_right=-1, - logits_soft_cap=self.logits_soft_cap, - ) - else: - # prefix-enabled attention - raise RuntimeError( - "IPEX backend doesn't support prefix decoding.") - - else: - # Decoding run. 
- max_seq_len = attn_metadata.max_decode_seq_len - output = torch.empty_like(query) - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory - # shortage. - use_v1 = (max_seq_len <= 8192 and - (max_num_partitions == 1 or num_seqs * num_heads > 512)) - if use_v1: - # Run PagedAttention V1. - ipex_ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ipex_ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - - # Reshape the output tensor. - return output.view(-1, self.num_heads * self.head_size) - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: - attn_biases = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. 
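# [Editor's illustration -- not part of this patch] The relative-position bias built
# just below, shown for a tiny assumed seq_len of 4: entry (i, j) holds j - i, so
# after the causal mask only j <= i remains, giving 0, -1, -2, ... per query row as
# in the ALiBi paper referenced in the comment above.
import torch

seq_len = 4
pos = torch.arange(seq_len, dtype=torch.float32)
bias = pos[None, :] - pos[:, None]
print(bias)
# tensor([[ 0.,  1.,  2.,  3.],
#         [-1.,  0.,  1.,  2.],
#         [-2., -1.,  0.,  1.],
#         [-3., -2., -1.,  0.]])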
- bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]) - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype, - device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) - attn_biases.append((bias + inf_mask).to(dtype)) - - return attn_biases - - -def _make_sliding_window_bias( - seq_lens: List[int], - window_size: Optional[int], - dtype: torch.dtype, -) -> List[torch.Tensor]: - attn_biases = [] - for seq_len in seq_lens: - tensor = torch.full( - (1, seq_len, seq_len), - dtype=dtype, - fill_value=1, - ) - shift = 0 - mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore - if window_size is not None: - mask = torch.triu(mask, diagonal=shift - window_size + 1) - mask = torch.log(mask) - attn_biases.append(mask.to(dtype)) - - return attn_biases diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py deleted file mode 100644 index af5fe81dc883..000000000000 --- a/vllm/attention/backends/torch_sdpa.py +++ /dev/null @@ -1,707 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" Attention layer with torch scaled_dot_product_attention - and PagedAttention.""" -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from torch.nn.functional import scaled_dot_product_attention - -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -# yapf: enable -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex -from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.logger import init_logger -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder - -logger = init_logger(__name__) - - -class TorchSDPABackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "TORCH_SDPA" - - @staticmethod - def get_impl_cls() -> Type["TorchSDPABackendImpl"]: - return TorchSDPABackendImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return TorchSDPAMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]: - return TorchSDPAMetadataBuilder - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - raise NotImplementedError("Swap is not supported in TorchSDPABackend.") - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for TorchSDPABackend. - """ - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. 
- chunked_prefill: bool - seq_lens: Optional[List[int]] = None # For non-chunked prefill - - # For chunked prefill only - max_query_len: Optional[int] = None - max_kv_len: Optional[int] = None - prefill_query_start_loc: Optional[torch.Tensor] = None - kv_start_loc: Optional[torch.Tensor] = None - prefill_block_tables: Optional[torch.Tensor] = None - - # For V1 logits index only - query_start_loc: Optional[torch.Tensor] = None - - # Begin encoder attn & enc/dec cross-attn fields... - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None - self.encoder_attn_bias: Optional[List[torch.Tensor]] = None - self.cross_attn_bias: Optional[List[torch.Tensor]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return ((self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None)) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return (self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None)) - - @property - def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_prefill_tokens == 0: - return None - return self - - @property - def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_decode_tokens == 0: - return None - return self - - def get_seq_lens( - self, - attn_type: str, - ): - ''' - Extract appropriate sequence lengths from attention metadata - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate sequence lengths tensor for query - * Appropriate sequence lengths tensor for key & value - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - seq_lens_q = self.seq_lens - seq_lens_kv = self.seq_lens - elif attn_type == AttentionType.ENCODER: - seq_lens_q = self.encoder_seq_lens - seq_lens_kv = self.encoder_seq_lens - elif attn_type == AttentionType.ENCODER_DECODER: - seq_lens_q = self.seq_lens - seq_lens_kv = self.encoder_seq_lens - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - return seq_lens_q, seq_lens_kv - - def get_attn_bias( - self, - attn_type: str, - ) -> Optional[List[torch.Tensor]]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. 
- - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return self.attn_bias - elif attn_type == AttentionType.ENCODER: - return self.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return self.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def set_attn_bias( - self, - attn_bias: List[torch.Tensor], - attn_type: str, - ) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - self.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - self.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - self.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def get_seq_len_block_table_args( - self, - attn_type: str, - ) -> tuple: - ''' - The particular choice of sequence-length- and block-table-related - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. - - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & - cross-attn block-tables fields - Encoder attn -> select encoder sequence lengths fields & no block tables - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * is_prompt: True if prefill, False otherwise - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - - * Appropriate sequence-lengths tensor - * Appropriate max sequence-length scalar - * Appropriate block tables (or None) - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - return (self.seq_lens_tensor, self.max_decode_seq_len, - self.block_tables) - elif attn_type == AttentionType.ENCODER_DECODER: - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - self.cross_block_tables) - elif attn_type == AttentionType.ENCODER: - # No block tables associated with encoder attention - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - None) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): - - def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: - self.chunked_prefill = input_builder.chunked_prefill - self.input_builder = input_builder - - def prepare(self): - self.input_data = self.input_builder.input_data - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: - input_data = self.input_data - prefill_seq_lens = 
seq_lens[0:input_data.num_prefills] - prefill_query_lens = query_lens[0:input_data.num_prefills] - slot_mapping = torch.tensor(input_data.slot_mapping, - dtype=torch.long, - device="cpu") - - # For chunked-prefill - if self.chunked_prefill and input_data.num_prefill_tokens != 0: - prefill_block_tables = make_tensor_with_pad( - self.input_data.prefill_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - query_lens_tensor = torch.tensor(prefill_query_lens, - dtype=torch.int32, - device="cpu") - kv_lens_tensor = torch.tensor(prefill_seq_lens, - dtype=torch.int32, - device="cpu") - query_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - kv_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - torch.cumsum(query_lens_tensor, - dim=0, - dtype=torch.int32, - out=query_start_loc[1:]) - torch.cumsum(kv_lens_tensor, - dim=0, - dtype=torch.int32, - out=kv_start_loc[1:]) - max_query_len = max(prefill_query_lens) - max_kv_len = max(prefill_seq_lens) - else: - prefill_block_tables = None - query_start_loc = None - kv_start_loc = None - max_query_len = None - max_kv_len = None - - # For paged attention - if input_data.num_decode_tokens != 0: - seq_lens_tensor = torch.tensor( - input_data.seq_lens[input_data.num_prefills:], - dtype=torch.int32, - device="cpu", - ) - block_tables = make_tensor_with_pad( - self.input_data.decode_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor( - input_data.seq_lens[:input_data.num_prefills], - dtype=torch.int32, - device="cpu", - ) - - # For multi-modal models - placeholder_index_maps = None - if len(input_data.multi_modal_inputs_list) != 0: - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - input_data.multi_modal_placeholder_maps.items() - } - - attn_metadata = TorchSDPAMetadata( - chunked_prefill=self.chunked_prefill, - seq_lens=prefill_seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_kv_len=max_kv_len, - prefill_query_start_loc=query_start_loc, - kv_start_loc=kv_start_loc, - max_decode_seq_len=input_data.max_decode_seq_len, - num_prefills=input_data.num_prefills, - num_prefill_tokens=input_data.num_prefill_tokens, - num_decode_tokens=input_data.num_decode_tokens, - block_tables=block_tables, - prefill_block_tables=prefill_block_tables, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - ) - - return attn_metadata - - -class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if blocksparse_params is not None: - raise ValueError( - "Torch SPDA does not support block-sparse attention.") - if logits_soft_cap is not None: - logger.warning_once("Torch SPDA does not support logits soft cap. 
" - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in Torch SPDA is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.alibi_slopes is not None - or self.sliding_window is not None) - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: - raise NotImplementedError( - "Torch SDPA backend FP8 KV cache requires " - "intel_extension_for_pytorch support.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: TorchSDPAMetadata, # type: ignore - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with torch SDPA and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for TorchSDPABackendImpl") - - # For warming-up - if attn_metadata is None: - return query - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. 
for later use by paged attention - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - PagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - - if attn_type != AttentionType.ENCODER: - # Decoder self-attention supports chunked prefill. - # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - else: - # Encoder attention - chunked prefill is not applicable; - # derive token-count from query shape & and treat them - # as 100% prefill tokens - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - num_decode_tokens = 0 - - if attn_type == AttentionType.DECODER: - # Only enforce this shape-constraint for decoder - # self-attention - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - - output = torch.empty_like(query) - if prefill_meta := attn_metadata.prefill_metadata: - if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore - assert attn_metadata.seq_lens is not None - self._run_sdpa_forward(output, - query, - key, - value, - prefill_meta, - attn_type=attn_type) - else: - # prefix-enabled attention - assert not self.need_mask - import intel_extension_for_pytorch.llm.modules as ipex_modules - output = torch.empty_like(query) - ipex_modules.PagedAttention.flash_attn_varlen_func( - output[:prefill_meta.num_prefill_tokens, :, :], - query[:prefill_meta.num_prefill_tokens, :, :], - key_cache, - value_cache, - prefill_meta.prefill_query_start_loc, - prefill_meta.kv_start_loc, - prefill_meta.max_query_len, - prefill_meta.max_kv_len, - self.scale, - True, - prefill_meta.prefill_block_tables, - self.alibi_slopes, - ) - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - # Decoding run. - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = decode_meta.get_seq_len_block_table_args(attn_type) - - PagedAttention.forward_decode( - output[attn_metadata.num_prefill_tokens:, :, :], - query[attn_metadata.num_prefill_tokens:, :, :], - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. 
- return output.view(-1, self.num_heads * self.head_size) - - def _run_sdpa_forward( - self, - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: TorchSDPAMetadata, - attn_type: str = AttentionType.DECODER, - ) -> None: - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, dim=1) - - attn_masks = attn_metadata.get_attn_bias(attn_type) - if attn_masks is None: - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: - assert attn_metadata.seq_lens is not None - attn_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - seq_lens, _ = attn_metadata.get_seq_lens(attn_type) - attn_masks = [None] * len(seq_lens) - attn_metadata.set_attn_bias(attn_masks, attn_type) - - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - causal_attn = (attn_type == AttentionType.DECODER) - - seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) - start_q, start_kv = 0, 0 - for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, - attn_masks): - end_q = start_q + seq_len_q - end_kv = start_kv + seq_len_kv - sub_out = scaled_dot_product_attention( - query[None, :, start_q:end_q, :], - key[None, :, start_kv:end_kv, :], - value[None, :, start_kv:end_kv, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=causal_attn and mask is None, - scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) - output[start_q:end_q, :, :] = sub_out - start_q, start_kv = end_q, end_kv - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. 
- bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) - attn_biases.append((bias + inf_mask).to(dtype)) - - return attn_biases - - -def _make_sliding_window_bias( - seq_lens: List[int], - window_size: Optional[int], - dtype: torch.dtype, -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] - for seq_len in seq_lens: - tensor = torch.full( - (1, seq_len, seq_len), - dtype=dtype, - fill_value=1, - ) - shift = 0 - mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore - if window_size is not None: - mask = torch.triu(mask, diagonal=shift - window_size + 1) - mask = torch.log(mask) - attn_biases.append(mask.to(dtype)) - - return attn_biases From 9e94110c431b0b84d319b2eba5ac83d94caa3012 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 22:17:58 -0700 Subject: [PATCH 03/13] fix cpu Signed-off-by: Woosuk Kwon --- vllm/attention/backends/cpu_mla.py | 307 ----------------------------- vllm/platforms/cpu.py | 26 +-- 2 files changed, 6 insertions(+), 327 deletions(-) delete mode 100644 vllm/attention/backends/cpu_mla.py diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py deleted file mode 100644 index 793cb87b7434..000000000000 --- a/vllm/attention/backends/cpu_mla.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch - -import vllm._custom_ops as ops -from vllm._ipex_ops import ipex_ops -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState -from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder - - -class CPUMLABackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "CPU_MLA" - - @staticmethod - def get_metadata_cls() -> Type["CPUMLAMetadata"]: - return CPUMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]: - return CPUMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["MLACommonState"]: - return MLACommonState - - @staticmethod - def get_impl_cls() -> Type["CPUMLAImpl"]: - return CPUMLAImpl - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, # assumed to be 1 for MLA - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - ops.copy_blocks_mla(kv_caches, src_to_dists) - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [576] - - -@dataclass -class CPUMLAMetadata(TorchSDPAMetadata): - # New for MLA - # Input positions for rotrary embeddings since for MLA the rotary - # position embeddings are applied inside the attention backend - input_positions: 
torch.Tensor = None - - # required by MLACommonImpl - is_profile_run: bool = False - - -class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]): - - def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: - self.chunked_prefill = input_builder.chunked_prefill - self.input_builder = input_builder - assert not self.chunked_prefill, \ - "chunked prefill is currently not supported" - - def prepare(self): - self.input_data = self.input_builder.input_data - - def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size): - input_data = self.input_data - prefill_seq_lens = seq_lens[0:input_data.num_prefills] - prefill_query_lens = query_lens[0:input_data.num_prefills] - slot_mapping = torch.tensor(input_data.slot_mapping, - dtype=torch.long, - device="cpu") - - # metadata for prefill - if input_data.num_prefills > 0: - query_lens_tensor = torch.tensor(prefill_query_lens, - dtype=torch.int32, - device="cpu") - kv_lens_tensor = torch.tensor(prefill_seq_lens, - dtype=torch.int32, - device="cpu") - query_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - kv_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - torch.cumsum(query_lens_tensor, - dim=0, - dtype=torch.int32, - out=query_start_loc[1:]) - torch.cumsum(kv_lens_tensor, - dim=0, - dtype=torch.int32, - out=kv_start_loc[1:]) - max_query_len = max(prefill_query_lens) - max_kv_len = max(prefill_seq_lens) - - # for chunked-prefill - if self.chunked_prefill: - prefill_block_tables = make_tensor_with_pad( - self.input_data.prefill_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - prefill_block_tables = None - - else: - query_start_loc = None - kv_start_loc = None - max_query_len = None - max_kv_len = None - prefill_block_tables = None - - # metadata for decode - if input_data.num_decode_tokens != 0: - seq_lens_tensor = torch.tensor( - input_data.seq_lens[input_data.num_prefills:], - dtype=torch.int32, - device="cpu", - ) - block_tables = make_tensor_with_pad( - self.input_data.decode_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor( - input_data.seq_lens[:input_data.num_prefills], - dtype=torch.int32, - device="cpu", - ) - - # For multi-modal models - placeholder_index_maps = None - if len(input_data.multi_modal_inputs_list) != 0: - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - input_data.multi_modal_placeholder_maps.items() - } - - return CPUMLAMetadata( - chunked_prefill=self.chunked_prefill, - seq_lens=prefill_seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_kv_len=max_kv_len, - prefill_query_start_loc=query_start_loc, - kv_start_loc=kv_start_loc, - max_decode_seq_len=input_data.max_decode_seq_len, - num_prefills=input_data.num_prefills, - num_prefill_tokens=input_data.num_prefill_tokens, - num_decode_tokens=input_data.num_decode_tokens, - block_tables=block_tables, - prefill_block_tables=prefill_block_tables, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - input_positions=torch.tensor([self.input_data.input_positions])) - - -class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - 
kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] - if any(unsupported_features): - raise NotImplementedError( - "CPUMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "CPUMLAImpl") - - # states is implemented. - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "CPUMLAImpl with FP8 KV cache not yet supported") - - def _forward_prefill( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: CPUMLAMetadata, # type: ignore[override] - ) -> torch.Tensor: - - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) - - # For MLA the v head dim is smaller than qk head dim so we pad out - # v with 0s to match the qk head dim - v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]], - value=0) - - output = torch.empty_like(q) - ipex_ops.varlen_attention( - query=q, - key=k, - value=v_padded, - out=output, - seqlen_q=prefill_metadata.prefill_query_start_loc, - seqlen_k=prefill_metadata.prefill_query_start_loc, - max_seqlen_q=prefill_metadata.max_query_len, - max_seqlen_k=prefill_metadata.max_query_len, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None, - logits_soft_cap=0.0, - window_size_left=-1, - window_size_right=-1, - alibi_slopes=None, - ) - - # remove padding - output = output.view(-1, self.num_heads, - q.shape[-1])[..., :v.shape[-1]] - return output.reshape(-1, self.num_heads * v.shape[-1]) - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: CPUMLAMetadata, # type: ignore[override] - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - - q = torch.cat([q_nope, q_pe], dim=-1) - o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank) - - # Run MQA - ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale, - decode_meta.block_tables, - decode_meta.seq_lens_tensor) - return self._v_up_proj(o) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index dccd60f4463a..1050d3c59344 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -64,13 +64,11 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, if selected_backend and selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: - logger.info("Using CPU MLA backend.") - return 
"vllm.attention.backends.cpu_mla.CPUMLABackend" + raise NotImplementedError("MLA is not supported on CPU.") logger.info("Using Torch SDPA backend.") - if use_v1: - return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" - else: - return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" + if not use_v1: + raise ValueError("CPU backend only supports V1.") + return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: @@ -147,26 +145,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "mp" if parallel_config.worker_cls == "auto": - if vllm_config.speculative_config: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.cpu_worker.CPUWorker" - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.cpu_worker.CPUWorker" - else: - parallel_config.worker_cls = \ - "vllm.worker.cpu_worker.CPUWorker" + parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker" # Note: workaround for v1 gpu_model_runner from vllm.config import CompilationLevel vllm_config.compilation_config.cudagraph_capture_sizes = [] compilation_config = vllm_config.compilation_config - if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE): + if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE: # Note: vLLM V1 is using PIECEWISE level compilation, which will # take time to compile kernels just-in-time with the inductor From d28a003b1888e67256905bebdd87cf13716b3884 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 22:19:17 -0700 Subject: [PATCH 04/13] fix xpu Signed-off-by: Woosuk Kwon --- vllm/platforms/xpu.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 61a0453dcbc8..5bd34033233a 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -39,12 +39,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) use_v1 = envs.VLLM_USE_V1 - if use_v1: - logger.info("Using Flash Attention backend on V1 engine.") - return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" - else: - logger.info("Using IPEX attention backend.") - return "vllm.attention.backends.ipex_attn.IpexAttnBackend" + if not use_v1: + raise ValueError("XPU backend only supports V1.") + logger.info("Using Flash Attention backend on V1 engine.") + return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" @classmethod def get_device_capability( @@ -77,10 +75,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config # in V1(or with ipex chunked prefill) block_size is 64 if cache_config and cache_config.block_size is None: - if envs.VLLM_USE_V1: - cache_config.block_size = 64 - else: - cache_config.block_size = 16 + cache_config.block_size = 64 # Instances created using VllmConfig() typically have model_config as # None by default. 
The modification involves adding a check to prevent @@ -106,11 +101,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # check and update parallel config parallel_config = vllm_config.parallel_config - if envs.VLLM_USE_V1: - parallel_config.worker_cls =\ - "vllm.v1.worker.xpu_worker.XPUWorker" - else: - parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker" if parallel_config.distributed_executor_backend is None: if parallel_config.world_size > 1: From 01eb1fff98cde16f6dc794e318b7ecb29b209619 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 23:07:13 -0700 Subject: [PATCH 05/13] Remove --dtype float32 Signed-off-by: Woosuk Kwon --- examples/online_serving/chart-helm/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 28dba9a6f688..c7581b5dcee0 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 From 5203865c1e8bca016a420def217257b9f2eac5dc Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 23:24:33 -0700 Subject: [PATCH 06/13] fix Signed-off-by: Woosuk Kwon --- vllm/attention/backends/torch_sdpa.py | 546 ++++++++++++++++++++++++++ 1 file changed, 546 insertions(+) create mode 100644 vllm/attention/backends/torch_sdpa.py diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py new file mode 100644 index 000000000000..a490aa397991 --- /dev/null +++ b/vllm/attention/backends/torch_sdpa.py @@ -0,0 +1,546 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.functional import scaled_dot_product_attention + +# yapf conflicts with isort for this block +# yapf: disable +from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, + AttentionMetadata, AttentionType, + is_quantized_kv_cache) +# yapf: enable +from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex +from vllm.attention.ops.paged_attn import PagedAttentionMetadata +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass +class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for TorchSDPABackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. 
+ chunked_prefill: bool + seq_lens: Optional[List[int]] = None # For non-chunked prefill + + # For chunked prefill only + max_query_len: Optional[int] = None + max_kv_len: Optional[int] = None + prefill_query_start_loc: Optional[torch.Tensor] = None + kv_start_loc: Optional[torch.Tensor] = None + prefill_block_tables: Optional[torch.Tensor] = None + + # For V1 logits index only + query_start_loc: Optional[torch.Tensor] = None + + # Begin encoder attn & enc/dec cross-attn fields... + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + self.encoder_attn_bias: Optional[List[torch.Tensor]] = None + self.cross_attn_bias: Optional[List[torch.Tensor]] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) + + @property + def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_prefill_tokens == 0: + return None + return self + + @property + def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_decode_tokens == 0: + return None + return self + + def get_seq_lens( + self, + attn_type: str, + ): + ''' + Extract appropriate sequence lengths from attention metadata + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate sequence lengths tensor for query + * Appropriate sequence lengths tensor for key & value + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + seq_lens_q = self.seq_lens + seq_lens_kv = self.seq_lens + elif attn_type == AttentionType.ENCODER: + seq_lens_q = self.encoder_seq_lens + seq_lens_kv = self.encoder_seq_lens + elif attn_type == AttentionType.ENCODER_DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.encoder_seq_lens + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + return seq_lens_q, seq_lens_kv + + def get_attn_bias( + self, + attn_type: str, + ) -> Optional[List[torch.Tensor]]: + ''' + Extract appropriate attention bias from attention metadata + according to attention type. 
+ + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate attention bias value given the attention type + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + return self.attn_bias + elif attn_type == AttentionType.ENCODER: + return self.encoder_attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + return self.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def set_attn_bias( + self, + attn_bias: List[torch.Tensor], + attn_type: str, + ) -> None: + ''' + Update appropriate attention bias field of attention metadata, + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_bias: The desired attention bias value + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + self.attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER: + self.encoder_attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + self.cross_attn_bias = attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def get_seq_len_block_table_args( + self, + attn_type: str, + ) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. + + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + return (self.seq_lens_tensor, self.max_decode_seq_len, + self.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + self.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + use_irope: bool = False, + ) -> None: + if 
kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") + if blocksparse_params is not None: + raise ValueError( + "Torch SPDA does not support block-sparse attention.") + if logits_soft_cap is not None: + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") + if use_irope: + logger.warning_once( + "Using irope in Torch SPDA is not supported yet, it will fall" + " back to global attention for long context.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {supported_head_sizes}.") + + if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: + raise NotImplementedError( + "Torch SDPA backend FP8 KV cache requires " + "intel_extension_for_pytorch support.") + self.attn_type = attn_type + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: TorchSDPAMetadata, # type: ignore + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with torch SDPA and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TorchSDPABackendImpl") + + # For warming-up + if attn_metadata is None: + return query + + attn_type = self.attn_type + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): + # KV-cache during decoder-self- or + # encoder-decoder-cross-attention, but not + # during encoder attention. + # + # Even if there are no new key/value pairs to cache, + # we still need to break out key_cache and value_cache + # i.e. 
for later use by paged attention + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + if (key is not None) and (value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + # During cross-attention decode, key & value will be None, + # preventing this IF-statement branch from running + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) + + if attn_type != AttentionType.ENCODER: + # Decoder self-attention supports chunked prefill. + # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + else: + # Encoder attention - chunked prefill is not applicable; + # derive token-count from query shape & and treat them + # as 100% prefill tokens + assert attn_metadata.num_encoder_tokens is not None + num_prefill_tokens = attn_metadata.num_encoder_tokens + num_decode_tokens = 0 + + if attn_type == AttentionType.DECODER: + # Only enforce this shape-constraint for decoder + # self-attention + assert key.shape[0] == num_prefill_tokens + num_decode_tokens + assert value.shape[0] == num_prefill_tokens + num_decode_tokens + + output = torch.empty_like(query) + if prefill_meta := attn_metadata.prefill_metadata: + if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + assert attn_metadata.seq_lens is not None + self._run_sdpa_forward(output, + query, + key, + value, + prefill_meta, + attn_type=attn_type) + else: + # prefix-enabled attention + assert not self.need_mask + import intel_extension_for_pytorch.llm.modules as ipex_modules + output = torch.empty_like(query) + ipex_modules.PagedAttention.flash_attn_varlen_func( + output[:prefill_meta.num_prefill_tokens, :, :], + query[:prefill_meta.num_prefill_tokens, :, :], + key_cache, + value_cache, + prefill_meta.prefill_query_start_loc, + prefill_meta.kv_start_loc, + prefill_meta.max_query_len, + prefill_meta.max_kv_len, + self.scale, + True, + prefill_meta.prefill_block_tables, + self.alibi_slopes, + ) + + if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") + # Decoding run. + ( + seq_lens_arg, + max_seq_len_arg, + block_tables_arg, + ) = decode_meta.get_seq_len_block_table_args(attn_type) + + PagedAttention.forward_decode( + output[attn_metadata.num_prefill_tokens:, :, :], + query[attn_metadata.num_prefill_tokens:, :, :], + key_cache, + value_cache, + block_tables_arg, + seq_lens_arg, + max_seq_len_arg, + self.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + layer._k_scale, + layer._v_scale, + ) + + # Reshape the output tensor. 
+ return output.view(-1, self.num_heads * self.head_size) + + def _run_sdpa_forward( + self, + output: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: TorchSDPAMetadata, + attn_type: str = AttentionType.DECODER, + ) -> None: + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, dim=1) + + attn_masks = attn_metadata.get_attn_bias(attn_type) + if attn_masks is None: + if self.alibi_slopes is not None: + attn_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + assert attn_metadata.seq_lens is not None + attn_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + seq_lens, _ = attn_metadata.get_seq_lens(attn_type) + attn_masks = [None] * len(seq_lens) + attn_metadata.set_attn_bias(attn_masks, attn_type) + + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + causal_attn = (attn_type == AttentionType.DECODER) + + seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) + start_q, start_kv = 0, 0 + for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, + attn_masks): + end_q = start_q + seq_len_q + end_kv = start_kv + seq_len_kv + sub_out = scaled_dot_product_attention( + query[None, :, start_q:end_q, :], + key[None, :, start_kv:end_kv, :], + value[None, :, start_kv:end_kv, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=causal_attn and mask is None, + scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) + output[start_q:end_q, :, :] = sub_out + start_q, start_kv = end_q, end_kv + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. 
+ bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases From c6235f24f79a4e9d296ebf490429b0fa22f4f5f2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 2 Jul 2025 23:27:17 -0700 Subject: [PATCH 07/13] fix Signed-off-by: Woosuk Kwon --- .buildkite/scripts/hardware_ci/run-cpu-test.sh | 8 ++++---- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 42506730e868..737b2eede9c6 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -66,10 +66,10 @@ function cpu_tests() { tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$NUMA_NODE" bash -c " - set -e - VLLM_USE_V1=0 pytest -s -v \ - tests/quantization/test_ipex_quant.py" + # docker exec cpu-test-"$NUMA_NODE" bash -c " + # set -e + # VLLM_USE_V1=0 pytest -s -v \ + # tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 827649bfcf54..cf3aaab8493b 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -26,7 +26,5 @@ docker run \ --name "${container_name}" \ "${image_name}" \ sh -c ' - VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m - VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager ' From b32187f2446938649d026a24f2bbfd255ebb16f7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 3 Jul 2025 08:01:56 -0700 Subject: [PATCH 08/13] fix attn test Signed-off-by: Woosuk Kwon --- tests/kernels/attention/test_attention_selector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index a8ed749ba13b..73087e09e730 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -36,7 +36,8 @@ def clear_cache(): DEVICE_MLA_BLOCK_SIZES = { "cuda": [16, 64], # CUDA supports both standard and extended block sizes "hip": [16, 1], # HIP requires special handling for block_size=1 - "cpu": [16] # CPU uses fixed block size from test cases + # "cpu": [16] # CPU uses fixed block size from 
test cases + "cpu": [] # FIXME(woosuk): Temporarily disable CPU tests } From 45878b023302578d88594bbe8d25005957c03ec8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 3 Jul 2025 08:05:21 -0700 Subject: [PATCH 09/13] fix Signed-off-by: Woosuk Kwon --- examples/online_serving/chart-helm/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index c7581b5dcee0..9407780bffdf 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 From c139974a7967883442930109e03d1a3543f48dcd Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 3 Jul 2025 08:19:48 -0700 Subject: [PATCH 10/13] enforce eager Signed-off-by: Woosuk Kwon --- examples/online_serving/chart-helm/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9407780bffdf..815f02a4bfd5 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 From ec0ff9f64b315b3170cf808c4dce735e72aff18b Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 4 Jul 2025 11:26:30 +0800 Subject: [PATCH 11/13] [V0 deprecation] Remove V0 CPU (#20437) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 24 +- .../models/language/generation/test_common.py | 8 +- .../models/language/pooling/test_embedding.py | 23 +- tests/models/language/pooling/test_reward.py | 5 + tests/quantization/test_compressed_tensors.py | 3 +- vllm/attention/backends/torch_sdpa.py | 546 ------------ vllm/attention/ops/ipex_attn.py | 195 ----- vllm/engine/arg_utils.py | 2 + vllm/v1/attention/backends/cpu_attn.py | 779 +++++++++++++++++- 9 files changed, 811 insertions(+), 774 deletions(-) delete mode 100644 vllm/attention/backends/torch_sdpa.py delete mode 100644 vllm/attention/ops/ipex_attn.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 737b2eede9c6..afe3e4b7ef69 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -48,10 +48,16 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model - pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model - pytest -v -s tests/models/language/generation -m cpu_model - VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model + # Note: disable until 
supports V1 + # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + + # Note: disable Bart until supports V1 + pytest -v -s tests/models/language/generation -m cpu_model \ + --ignore=tests/models/language/generation/test_bart.py + VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ + --ignore=tests/models/language/generation/test_bart.py + pytest -v -s tests/models/language/pooling -m cpu_model pytest -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_mllama.py \ @@ -62,21 +68,15 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e # VLLM_USE_V1=0 pytest -s -v \ # tests/quantization/test_ipex_quant.py" - # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$NUMA_NODE" bash -c " - set -e - pytest -s -v -k cpu_model \ - tests/basic_correctness/test_chunked_prefill.py" - # online serving docker exec cpu-test-"$NUMA_NODE" bash -c " set -e diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 7d7a62eec118..8aba68829b10 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -39,7 +39,7 @@ [ pytest.param( "bigscience/bloom-560m", # bloom - testing alibi slopes - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[pytest.mark.core_model], ), pytest.param( "openai-community/gpt2", # gpt2 @@ -87,7 +87,11 @@ pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( "TitanML/tiny-mixtral", # mixtral - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[pytest.mark.core_model], + ), + pytest.param( + "Qwen/Qwen1.5-MoE-A2.7B-Chat", + marks=[pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index b8b17524cf07..4b5c77d16fe2 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest @@ -28,8 +27,10 @@ def v1(run_with_both_engines): # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param( + "intfloat/e5-mistral-7b-instruct", + # CPU v1 doesn't support sliding window + marks=[pytest.mark.core_model]), # the qwen models interfere with each other (see PR # https://github.com/vllm-project/vllm/pull/18720). 
# To avoid this problem, for now we skip v0 since it will be @@ -37,11 +38,13 @@ def v1(run_with_both_engines): pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), # [Encoder-only] - pytest.param("BAAI/bge-base-en-v1.5", - marks=[ - pytest.mark.core_model, pytest.mark.cpu_model, - pytest.mark.skip_v1 - ]), + pytest.param( + "BAAI/bge-base-en-v1.5", + marks=[ + # CPU only supports V1 + pytest.mark.core_model, + pytest.mark.skip_v1 + ]), pytest.param("sentence-transformers/all-MiniLM-L12-v2", marks=[pytest.mark.skip_v1]), pytest.param("intfloat/multilingual-e5-small", @@ -60,10 +63,6 @@ def test_models( model, monkeypatch, ) -> None: - if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu( - ) and os.environ.get("VLLM_USE_V1", "0") == "1": - pytest.skip("CPU V1 doesn't support sliding window") - if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index ec3d25ee22a9..3b7fab3ba5c9 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + import pytest import torch import torch.nn.functional as F @@ -84,6 +86,9 @@ def test_prm_models( dtype: str, monkeypatch, ) -> None: + if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": + pytest.skip("CPU only supports V1") + if current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 3646ad6c481b..db7e50eff72b 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -45,7 +45,8 @@ def use_v0_only(monkeypatch): """ This module relies on V0 internals, so set VLLM_USE_V1=0. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + if not current_platform.is_cpu(): + monkeypatch.setenv('VLLM_USE_V1', '0') @pytest.mark.parametrize( diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py deleted file mode 100644 index a490aa397991..000000000000 --- a/vllm/attention/backends/torch_sdpa.py +++ /dev/null @@ -1,546 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" Attention layer with torch scaled_dot_product_attention - and PagedAttention.""" -from dataclasses import dataclass -from typing import Any, Dict, List, Optional - -import torch -from torch.nn.functional import scaled_dot_product_attention - -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -# yapf: enable -from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex -from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -@dataclass -class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for TorchSDPABackend. - """ - # Currently, input sequences can only contain all prompts - # or all decoding. 
True if all sequences are prompts. - chunked_prefill: bool - seq_lens: Optional[List[int]] = None # For non-chunked prefill - - # For chunked prefill only - max_query_len: Optional[int] = None - max_kv_len: Optional[int] = None - prefill_query_start_loc: Optional[torch.Tensor] = None - kv_start_loc: Optional[torch.Tensor] = None - prefill_block_tables: Optional[torch.Tensor] = None - - # For V1 logits index only - query_start_loc: Optional[torch.Tensor] = None - - # Begin encoder attn & enc/dec cross-attn fields... - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None - self.encoder_attn_bias: Optional[List[torch.Tensor]] = None - self.cross_attn_bias: Optional[List[torch.Tensor]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return ((self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None)) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return (self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None)) - - @property - def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_prefill_tokens == 0: - return None - return self - - @property - def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_decode_tokens == 0: - return None - return self - - def get_seq_lens( - self, - attn_type: str, - ): - ''' - Extract appropriate sequence lengths from attention metadata - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate sequence lengths tensor for query - * Appropriate sequence lengths tensor for key & value - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - seq_lens_q = self.seq_lens - seq_lens_kv = self.seq_lens - elif attn_type == AttentionType.ENCODER: - seq_lens_q = self.encoder_seq_lens - seq_lens_kv = self.encoder_seq_lens - elif attn_type == AttentionType.ENCODER_DECODER: - seq_lens_q = self.seq_lens - seq_lens_kv = self.encoder_seq_lens - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - return seq_lens_q, seq_lens_kv - - def get_attn_bias( - self, - attn_type: str, - ) -> Optional[List[torch.Tensor]]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. 
- - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return self.attn_bias - elif attn_type == AttentionType.ENCODER: - return self.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return self.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def set_attn_bias( - self, - attn_bias: List[torch.Tensor], - attn_type: str, - ) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - self.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - self.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - self.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def get_seq_len_block_table_args( - self, - attn_type: str, - ) -> tuple: - ''' - The particular choice of sequence-length- and block-table-related - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. - - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & - cross-attn block-tables fields - Encoder attn -> select encoder sequence lengths fields & no block tables - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * is_prompt: True if prefill, False otherwise - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - - * Appropriate sequence-lengths tensor - * Appropriate max sequence-length scalar - * Appropriate block tables (or None) - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - return (self.seq_lens_tensor, self.max_decode_seq_len, - self.block_tables) - elif attn_type == AttentionType.ENCODER_DECODER: - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - self.cross_block_tables) - elif attn_type == AttentionType.ENCODER: - # No block tables associated with encoder attention - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - None) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if 
kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if blocksparse_params is not None: - raise ValueError( - "Torch SPDA does not support block-sparse attention.") - if logits_soft_cap is not None: - logger.warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in Torch SPDA is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.alibi_slopes is not None - or self.sliding_window is not None) - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: - raise NotImplementedError( - "Torch SDPA backend FP8 KV cache requires " - "intel_extension_for_pytorch support.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: TorchSDPAMetadata, # type: ignore - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with torch SDPA and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for TorchSDPABackendImpl") - - # For warming-up - if attn_metadata is None: - return query - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. 
for later use by paged attention - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - PagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - - if attn_type != AttentionType.ENCODER: - # Decoder self-attention supports chunked prefill. - # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - else: - # Encoder attention - chunked prefill is not applicable; - # derive token-count from query shape & and treat them - # as 100% prefill tokens - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - num_decode_tokens = 0 - - if attn_type == AttentionType.DECODER: - # Only enforce this shape-constraint for decoder - # self-attention - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - - output = torch.empty_like(query) - if prefill_meta := attn_metadata.prefill_metadata: - if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore - assert attn_metadata.seq_lens is not None - self._run_sdpa_forward(output, - query, - key, - value, - prefill_meta, - attn_type=attn_type) - else: - # prefix-enabled attention - assert not self.need_mask - import intel_extension_for_pytorch.llm.modules as ipex_modules - output = torch.empty_like(query) - ipex_modules.PagedAttention.flash_attn_varlen_func( - output[:prefill_meta.num_prefill_tokens, :, :], - query[:prefill_meta.num_prefill_tokens, :, :], - key_cache, - value_cache, - prefill_meta.prefill_query_start_loc, - prefill_meta.kv_start_loc, - prefill_meta.max_query_len, - prefill_meta.max_kv_len, - self.scale, - True, - prefill_meta.prefill_block_tables, - self.alibi_slopes, - ) - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - # Decoding run. - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = decode_meta.get_seq_len_block_table_args(attn_type) - - PagedAttention.forward_decode( - output[attn_metadata.num_prefill_tokens:, :, :], - query[attn_metadata.num_prefill_tokens:, :, :], - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. 
- return output.view(-1, self.num_heads * self.head_size) - - def _run_sdpa_forward( - self, - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: TorchSDPAMetadata, - attn_type: str = AttentionType.DECODER, - ) -> None: - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, dim=1) - - attn_masks = attn_metadata.get_attn_bias(attn_type) - if attn_masks is None: - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: - assert attn_metadata.seq_lens is not None - attn_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - seq_lens, _ = attn_metadata.get_seq_lens(attn_type) - attn_masks = [None] * len(seq_lens) - attn_metadata.set_attn_bias(attn_masks, attn_type) - - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - causal_attn = (attn_type == AttentionType.DECODER) - - seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) - start_q, start_kv = 0, 0 - for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, - attn_masks): - end_q = start_q + seq_len_q - end_kv = start_kv + seq_len_kv - sub_out = scaled_dot_product_attention( - query[None, :, start_q:end_q, :], - key[None, :, start_kv:end_kv, :], - value[None, :, start_kv:end_kv, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=causal_attn and mask is None, - scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) - output[start_q:end_q, :, :] = sub_out - start_q, start_kv = end_q, end_kv - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. 
- bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) - attn_biases.append((bias + inf_mask).to(dtype)) - - return attn_biases - - -def _make_sliding_window_bias( - seq_lens: List[int], - window_size: Optional[int], - dtype: torch.dtype, -) -> List[torch.Tensor]: - attn_biases: List[torch.Tensor] = [] - for seq_len in seq_lens: - tensor = torch.full( - (1, seq_len, seq_len), - dtype=dtype, - fill_value=1, - ) - shift = 0 - mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore - if window_size is not None: - mask = torch.triu(mask, diagonal=shift - window_size + 1) - mask = torch.log(mask) - attn_biases.append(mask.to(dtype)) - - return attn_biases diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py deleted file mode 100644 index 891975498916..000000000000 --- a/vllm/attention/ops/ipex_attn.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -try: - import intel_extension_for_pytorch.llm.modules as ipex_modules - _use_ipex = True -# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 -except (ImportError, AttributeError): - _use_ipex = False - -import torch - -from vllm import _custom_ops as ops - - -class _PagedAttention: - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 80, 96, 112, 128, 192, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - *args, - ) -> Tuple[int, ...]: - return 2, num_blocks, block_size * num_kv_heads * head_size - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - *args, - ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 16 // kv_cache.element_size() - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, - -1, x) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping.flatten(), - kv_cache_dtype, - k_scale, - v_scale, - ) - - @staticmethod - def forward_decode( - output: torch.Tensor, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - tp_rank: int = 0 - blocksparse_local_blocks: int = 0 - blocksparse_vert_stride: int = 0 - blocksparse_block_size: int = 64 - blocksparse_head_sliding_step: int = 0 - block_size = value_cache.shape[3] - - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - 
block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - tp_rank, - blocksparse_local_blocks, - blocksparse_vert_stride, - blocksparse_block_size, - blocksparse_head_sliding_step, - ) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - *args, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -class _IPEXPagedAttention(_PagedAttention): - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - *args, - ) -> Tuple[torch.Tensor, torch.Tensor]: - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - ipex_modules.PagedAttention.reshape_and_cache( - key, value, key_cache, value_cache, - slot_mapping.flatten().int()) - - @staticmethod - def forward_decode( - output: torch.Tensor, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - block_size = value_cache.shape[2] - head_mapping = torch.arange( - 0, - num_kv_heads, - device="cpu", - dtype=torch.int32, - ).view(num_kv_heads, - 1).repeat_interleave(query.size(1) // num_kv_heads).flatten() - ipex_modules.PagedAttention.single_query_cached_kv_attention( - output, query.contiguous(), key_cache, value_cache, head_mapping, - scale, block_tables, context_lens, block_size, max_context_len, - alibi_slopes) - - -PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 284f09236131..873d29aaf312 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1395,6 +1395,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: # Only Fp16 and Bf16 dtypes since we only support FA. 
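        # [Editor's note] Hedged reading of the two added lines below: the
        # fp16/bf16 restriction documented above comes from FlashAttention,
        # which the CPU torch-SDPA path introduced by this patch (cpu_attn.py)
        # does not use, so the V1 dtype allow-list is extended with
        # torch.float32 when current_platform.is_cpu(); _raise_or_fallback is
        # the existing fallback helper used just after this check.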
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] + if current_platform.is_cpu(): + V1_SUPPORTED_DTYPES.append(torch.float32) if model_config.dtype not in V1_SUPPORTED_DTYPES: _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}", recommend_to_remove=False) diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 1c4604cc27e4..72b2d09fa1f1 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +from dataclasses import dataclass +from typing import Any, Optional + import numpy as np import torch +from torch.nn.functional import scaled_dot_product_attention -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl, - TorchSDPAMetadata) +from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, + AttentionMetadata, AttentionType, + is_quantized_kv_cache) from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.ipex_attn import PagedAttention +from vllm.logger import init_logger +from vllm.multimodal import MultiModalPlaceholderMap from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.core.sched.output import SchedulerOutput @@ -15,6 +20,17 @@ from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_input_batch import InputBatch +try: + import intel_extension_for_pytorch.llm.modules as ipex_modules + _use_ipex = True +# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 +except (ImportError, AttributeError): + _use_ipex = False + +from vllm import _custom_ops as ops + +logger = init_logger(__name__) + class TorchSDPABackend: accept_output_buffer: bool = False @@ -46,14 +62,265 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + return _get_paged_attn_impl().get_kv_cache_shape( # type: ignore + num_blocks, block_size, num_kv_heads, head_size) @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: return False +@dataclass +class TorchSDPAMetadata(AttentionMetadata): + """Attention metadata for prefill and decode batched together.""" + # Total number of prefill requests. + num_prefills: int + # Number of prefill tokens. + num_prefill_tokens: int + # Number of decode tokens. Note that it is equivalent to the number of + # decode requests. + num_decode_tokens: int + # (num_tokens,). The indices of the token slots that input tokens will be + # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size + # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot + # in block 0, and 1st slot in block 1, respectively. + slot_mapping: torch.Tensor + + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. + multi_modal_placeholder_index_maps: Optional[dict[ + str, MultiModalPlaceholderMap.IndexMap]] + + # Enable/disable KV scales calculation. This is so that we can disable the + # calculation until after prefill and cuda graph capture. 
+ enable_kv_scales_calculation: bool + """Metadata for PagedAttention.""" + # (batch_size,). The length of sequences (entire tokens seen so far) per + # sequence. + seq_lens_tensor: Optional[torch.Tensor] + # Maximum sequence length in the batch. 0 if it is prefill-only batch. + max_decode_seq_len: int + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + """Metadata for TorchSDPABackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + chunked_prefill: bool + seq_lens: Optional[list[int]] = None # For non-chunked prefill + + # For chunked prefill only + max_query_len: Optional[int] = None + max_kv_len: Optional[int] = None + prefill_query_start_loc: Optional[torch.Tensor] = None + kv_start_loc: Optional[torch.Tensor] = None + prefill_block_tables: Optional[torch.Tensor] = None + + # For V1 logits index only + query_start_loc: Optional[torch.Tensor] = None + + # Begin encoder attn & enc/dec cross-attn fields... + # Encoder sequence lengths representation + encoder_seq_lens: Optional[list[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[list[torch.Tensor]] = None + self.encoder_attn_bias: Optional[list[torch.Tensor]] = None + self.cross_attn_bias: Optional[list[torch.Tensor]] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) + + @property + def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_prefill_tokens == 0: + return None + return self + + @property + def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_decode_tokens == 0: + return None + return self + + def get_seq_lens( + self, + attn_type: str, + ): + ''' + Extract appropriate sequence lengths from attention metadata + according to attention type. 
+ + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate sequence lengths tensor for query + * Appropriate sequence lengths tensor for key & value + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + seq_lens_q = self.seq_lens + seq_lens_kv = self.seq_lens + elif attn_type == AttentionType.ENCODER: + seq_lens_q = self.encoder_seq_lens + seq_lens_kv = self.encoder_seq_lens + elif attn_type == AttentionType.ENCODER_DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.encoder_seq_lens + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + return seq_lens_q, seq_lens_kv + + def get_attn_bias( + self, + attn_type: str, + ) -> Optional[list[torch.Tensor]]: + ''' + Extract appropriate attention bias from attention metadata + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate attention bias value given the attention type + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + return self.attn_bias + elif attn_type == AttentionType.ENCODER: + return self.encoder_attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + return self.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def set_attn_bias( + self, + attn_bias: list[torch.Tensor], + attn_type: str, + ) -> None: + ''' + Update appropriate attention bias field of attention metadata, + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_bias: The desired attention bias value + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + self.attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER: + self.encoder_attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + self.cross_attn_bias = attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def get_seq_len_block_table_args( + self, + attn_type: str, + ) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. 
+ + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + return (self.seq_lens_tensor, self.max_decode_seq_len, + self.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + self.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec, @@ -165,3 +432,503 @@ def build(self, common_prefix_len: int, ) return attn_metadata + + +class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + use_irope: bool = False, + ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") + if blocksparse_params is not None: + raise ValueError( + "Torch SPDA does not support block-sparse attention.") + if logits_soft_cap is not None: + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") + if use_irope: + logger.warning_once( + "Using irope in Torch SPDA is not supported yet, it will fall" + " back to global attention for long context.") + self.paged_attn_impl = _get_paged_attn_impl() + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = self.paged_attn_impl \ + .get_supported_head_sizes() # type: ignore + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. 
" + f"Supported head sizes are: {supported_head_sizes}.") + + if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: + raise NotImplementedError( + "Torch SDPA backend FP8 KV cache requires " + "intel_extension_for_pytorch support.") + self.attn_type = attn_type + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: TorchSDPAMetadata, # type: ignore + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with torch SDPA and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TorchSDPABackendImpl") + + # For warming-up + if attn_metadata is None: + return query + + attn_type = self.attn_type + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): + # KV-cache during decoder-self- or + # encoder-decoder-cross-attention, but not + # during encoder attention. + # + # Even if there are no new key/value pairs to cache, + # we still need to break out key_cache and value_cache + # i.e. for later use by paged attention + key_cache, value_cache = self.paged_attn_impl \ + .split_kv_cache( # type: ignore + kv_cache, self.num_kv_heads, self.head_size) + + if (key is not None) and (value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + # During cross-attention decode, key & value will be None, + # preventing this IF-statement branch from running + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + self.paged_attn_impl.write_to_paged_cache( # type: ignore + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) + + if attn_type != AttentionType.ENCODER: + # Decoder self-attention supports chunked prefill. 
+ # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + else: + # Encoder attention - chunked prefill is not applicable; + # derive token-count from query shape & and treat them + # as 100% prefill tokens + assert attn_metadata.num_encoder_tokens is not None + num_prefill_tokens = attn_metadata.num_encoder_tokens + num_decode_tokens = 0 + + if attn_type == AttentionType.DECODER: + # Only enforce this shape-constraint for decoder + # self-attention + assert key.shape[0] == num_prefill_tokens + num_decode_tokens + assert value.shape[0] == num_prefill_tokens + num_decode_tokens + + output = torch.empty_like(query) + if prefill_meta := attn_metadata.prefill_metadata: + if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + assert attn_metadata.seq_lens is not None + self._run_sdpa_forward(output, + query, + key, + value, + prefill_meta, + attn_type=attn_type) + else: + # prefix-enabled attention + assert not self.need_mask + import intel_extension_for_pytorch.llm.modules as ipex_modules + output = torch.empty_like(query) + ipex_modules.PagedAttention.flash_attn_varlen_func( + output[:prefill_meta.num_prefill_tokens, :, :], + query[:prefill_meta.num_prefill_tokens, :, :], + key_cache, + value_cache, + prefill_meta.prefill_query_start_loc, + prefill_meta.kv_start_loc, + prefill_meta.max_query_len, + prefill_meta.max_kv_len, + self.scale, + True, + prefill_meta.prefill_block_tables, + self.alibi_slopes, + ) + + if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") + # Decoding run. + ( + seq_lens_arg, + max_seq_len_arg, + block_tables_arg, + ) = decode_meta.get_seq_len_block_table_args(attn_type) + + self.paged_attn_impl.forward_decode( # type: ignore + output[attn_metadata.num_prefill_tokens:, :, :], + query[attn_metadata.num_prefill_tokens:, :, :], + key_cache, + value_cache, + block_tables_arg, + seq_lens_arg, + max_seq_len_arg, + self.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + layer._k_scale, + layer._v_scale, + ) + + # Reshape the output tensor. 
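        # [Editor's note] Shape sketch for the view below: inside forward() the
        # query/output tensors are handled as [num_tokens, num_heads, head_size];
        # prefill tokens fill output[:num_prefill_tokens] (SDPA or the IPEX
        # varlen kernel) and decode tokens fill output[num_prefill_tokens:]
        # (paged attention), so flattening the last two dims restores the
        # documented [num_tokens, num_heads * head_size] return shape.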
+ return output.view(-1, self.num_heads * self.head_size) + + def _run_sdpa_forward( + self, + output: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: TorchSDPAMetadata, + attn_type: str = AttentionType.DECODER, + ) -> None: + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, dim=1) + + attn_masks = attn_metadata.get_attn_bias(attn_type) + if attn_masks is None: + if self.alibi_slopes is not None: + attn_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + assert attn_metadata.seq_lens is not None + attn_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + seq_lens, _ = attn_metadata.get_seq_lens(attn_type) + attn_masks = [None] * len(seq_lens) + attn_metadata.set_attn_bias(attn_masks, attn_type) + + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + causal_attn = (attn_type == AttentionType.DECODER) + + seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) + start_q, start_kv = 0, 0 + for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, + attn_masks): + end_q = start_q + seq_len_q + end_kv = start_kv + seq_len_kv + sub_out = scaled_dot_product_attention( + query[None, :, start_q:end_q, :], + key[None, :, start_kv:end_kv, :], + value[None, :, start_kv:end_kv, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=causal_attn and mask is None, + scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) + output[start_q:end_q, :, :] = sub_out + start_q, start_kv = end_q, end_kv + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: list[int], +) -> list[torch.Tensor]: + attn_biases: list[torch.Tensor] = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. 
+ bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: list[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> list[torch.Tensor]: + attn_biases: list[torch.Tensor] = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases + + +class _PagedAttention: + + @staticmethod + def get_supported_head_sizes() -> list[int]: + return [32, 64, 80, 96, 112, 128, 192, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + *args, + ) -> tuple[int, ...]: + return 2, num_blocks, block_size * num_kv_heads * head_size + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale, + v_scale, + ) + + @staticmethod + def forward_decode( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + tp_rank: int = 0 + blocksparse_local_blocks: int = 0 + blocksparse_vert_stride: int = 0 + blocksparse_block_size: int = 64 + blocksparse_head_sliding_step: int = 0 + block_size = value_cache.shape[3] + + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + + @staticmethod + def copy_blocks( + kv_caches: list[torch.Tensor], + src_to_dists: torch.Tensor, + *args, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) + + +class _IPEXPagedAttention(_PagedAttention): + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> 
tuple[torch.Tensor, torch.Tensor]: + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + ipex_modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, + slot_mapping.flatten().int()) + + @staticmethod + def forward_decode( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + block_size = value_cache.shape[2] + head_mapping = torch.arange( + 0, + num_kv_heads, + device="cpu", + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(query.size(1) // num_kv_heads).flatten() + ipex_modules.PagedAttention.single_query_cached_kv_attention( + output, query.contiguous(), key_cache, value_cache, head_mapping, + scale, block_tables, context_lens, block_size, max_context_len, + alibi_slopes) + + +def _get_paged_attn_impl() -> type: + if _use_ipex: + return _IPEXPagedAttention + else: + return _PagedAttention From fe5a9af2c5b490785a0c5fa924f787d304864305 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 6 Jul 2025 00:47:57 -0700 Subject: [PATCH 12/13] merge Signed-off-by: Woosuk Kwon --- .buildkite/release-pipeline.yaml | 2 +- .../scripts/hardware_ci/run-cpu-test.sh | 24 +- .buildkite/test-pipeline.yaml | 12 +- .github/CODEOWNERS | 2 +- .github/workflows/lint-and-deploy.yaml | 2 +- CMakeLists.txt | 22 +- benchmarks/kernels/bench_fp8_gemm.py | 1 + ..._warpspecialized_fp8_blockwise_scaling.hpp | 1 - .../compressed_tensors/int8_quant_kernels.cu | 16 +- .../cutlass_w8a8/c3x/scaled_mm.cuh | 3 +- .../moe/blockwise_scaled_group_mm_sm100.cu | 374 +++++++++ .../quantization/machete/machete_mainloop.cuh | 1 - csrc/quantization/vectorization_utils.cuh | 97 +++ csrc/sampler.cu | 2 + csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh | 3 +- csrc/torch_bindings.cpp | 8 + docker/Dockerfile | 12 + docs/contributing/incremental_build.md | 2 +- docs/features/multimodal_inputs.md | 45 +- docs/models/supported_models.md | 22 +- examples/offline_inference/mistral-small.py | 7 +- .../offline_inference/profiling_tpu/README.md | 5 +- examples/offline_inference/spec_decode.py | 1 + examples/offline_inference/vision_language.py | 1 + .../vision_language_multi_image.py | 1 + .../disagg_proxy_p2p_nccl_xpyd.py | 1 + .../multi_instance_data_parallel.py | 1 + ..._chat_completion_client_with_tools_xlam.py | 1 + ...letion_client_with_tools_xlam_streaming.py | 1 + .../structured_outputs/README.md | 10 +- examples/others/tensorize_vllm_model.py | 12 +- requirements/kv_connectors.txt | 1 + tests/compile/test_fusion_attn.py | 1 + tests/config/test_mp_reducer.py | 57 ++ .../openai/test_transcription_validation.py | 39 +- tests/entrypoints/test_chat_utils.py | 12 +- tests/kernels/attention/test_attention.py | 3 +- .../attention/test_attention_selector.py | 28 + tests/kernels/moe/parallel_utils.py | 11 +- 
tests/kernels/moe/test_batched_moe.py | 84 +- .../kernels/moe/test_cutlass_grouped_gemm.py | 116 +++ tests/kernels/moe/test_deepep_deepgemm_moe.py | 4 +- tests/kernels/moe/test_deepep_moe.py | 6 +- tests/kernels/moe/test_deepgemm.py | 1 + tests/kernels/moe/test_pplx_cutlass_moe.py | 79 +- tests/kernels/moe/test_pplx_moe.py | 732 ++++++++++------ tests/kernels/moe/utils.py | 14 +- tests/kernels/quant_utils.py | 18 + .../test_apply_repetition_penalties.py | 49 ++ tests/kernels/test_flex_attention.py | 1 + tests/kernels/utils.py | 10 +- tests/lora/conftest.py | 17 - .../models/language/generation/test_common.py | 8 +- .../models/language/generation/test_hybrid.py | 70 +- .../models/language/pooling/test_embedding.py | 33 +- tests/models/language/pooling/test_gte.py | 16 +- .../models/language/pooling/test_intfloat.py | 1 + .../language/pooling/test_mxbai_rerank.py | 84 ++ tests/models/language/pooling/test_reward.py | 5 - tests/models/registry.py | 2 +- tests/quantization/test_compressed_tensors.py | 3 +- tests/quantization/test_rtn.py | 1 + tests/test_utils.py | 50 +- tests/tool_use/test_minimax_tool_parser.py | 1 + tests/tool_use/test_xlam_tool_parser.py | 1 + tests/v1/core/test_scheduler.py | 57 +- tests/v1/engine/test_llm_engine.py | 62 +- tests/v1/sample/test_logits_processors.py | 1 + tests/v1/test_oracle.py | 1 - tests/v1/test_request.py | 1 + .../v1/tpu/test_spmd_model_weight_loading.py | 1 + tests/v1/tpu/test_tpu_qkv_linear.py | 1 + tests/v1/worker/test_gpu_model_runner.py | 5 + tools/check_pickle_imports.py | 1 + tools/check_spdx_header.py | 147 +++- vllm/_custom_ops.py | 14 + vllm/attention/backends/torch_sdpa.py | 546 ++++++++++++ vllm/attention/ops/ipex_attn.py | 195 +++++ vllm/compilation/fusion_attn.py | 1 + vllm/config.py | 13 +- vllm/distributed/eplb/__init__.py | 1 + vllm/distributed/eplb/eplb_state.py | 1 + vllm/distributed/eplb/rebalance_algo.py | 1 + vllm/distributed/eplb/rebalance_execute.py | 1 + .../kv_transfer/kv_connector/utils.py | 4 +- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 1 + .../kv_connector/v1/p2p/p2p_nccl_engine.py | 1 + .../kv_connector/v1/p2p/tensor_memory_pool.py | 1 + .../kv_transfer/kv_pipe/mooncake_pipe.py | 48 +- vllm/distributed/tpu_distributed_utils.py | 1 + vllm/engine/arg_utils.py | 9 - vllm/entrypoints/chat_utils.py | 48 +- vllm/entrypoints/cli/benchmark/main.py | 1 + vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/protocol.py | 59 +- vllm/entrypoints/openai/serving_chat.py | 18 +- vllm/entrypoints/openai/serving_completion.py | 16 +- .../openai/tool_parsers/xlam_tool_parser.py | 3 +- vllm/entrypoints/utils.py | 22 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 15 +- .../batched_triton_or_deep_gemm_moe.py | 35 +- .../model_executor/layers/fused_moe/config.py | 64 +- .../layers/fused_moe/cpu_fused_moe.py | 1 + .../layers/fused_moe/cutlass_moe.py | 157 +++- .../fused_moe/deepep_ht_prepare_finalize.py | 24 +- .../fused_moe/deepep_ll_prepare_finalize.py | 16 +- .../layers/fused_moe/fused_batched_moe.py | 583 ++++++++----- .../layers/fused_moe/fused_marlin_moe.py | 6 +- .../layers/fused_moe/fused_moe.py | 64 +- vllm/model_executor/layers/fused_moe/layer.py | 43 +- .../layers/fused_moe/modular_kernel.py | 4 + .../layers/fused_moe/pplx_prepare_finalize.py | 95 ++- .../layers/fused_moe/prepare_finalize.py | 3 + vllm/model_executor/layers/fused_moe/utils.py | 37 +- .../layers/mamba/ops/mamba_ssm.py | 2 +- .../layers/quantization/awq_marlin.py | 6 +- .../compressed_tensors_moe.py | 57 +- 
.../schemes/compressed_tensors_w4a4_nvfp4.py | 1 + .../layers/quantization/deepgemm.py | 1 + .../model_executor/layers/quantization/fp8.py | 28 +- .../layers/quantization/gptq_marlin.py | 5 +- .../layers/quantization/modelopt.py | 1 + .../model_executor/layers/quantization/rtn.py | 1 + .../model_loader/tensorizer_loader.py | 8 +- vllm/model_executor/model_loader/tpu.py | 1 + vllm/model_executor/models/adapters.py | 102 ++- vllm/model_executor/models/aya_vision.py | 4 +- vllm/model_executor/models/bamba.py | 45 +- vllm/model_executor/models/config.py | 2 +- vllm/model_executor/models/dots1.py | 1 + vllm/model_executor/models/falcon_h1.py | 57 +- vllm/model_executor/models/glm4_1v.py | 28 +- .../model_executor/models/granitemoehybrid.py | 49 +- vllm/model_executor/models/nemotron_h.py | 45 +- vllm/model_executor/models/qwen3.py | 119 +-- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/tarsier.py | 1 + vllm/model_executor/models/whisper.py | 5 +- vllm/model_executor/models/zamba2.py | 101 ++- vllm/multimodal/utils.py | 67 +- vllm/platforms/cuda.py | 4 + vllm/platforms/interface.py | 4 + vllm/transformers_utils/config.py | 59 +- vllm/transformers_utils/configs/__init__.py | 4 - vllm/transformers_utils/configs/h2ovl.py | 16 - vllm/transformers_utils/configs/internvl.py | 54 -- vllm/transformers_utils/configs/nemotron_h.py | 1 + vllm/transformers_utils/configs/nvlm_d.py | 20 +- vllm/usage/usage_lib.py | 5 +- vllm/utils/__init__.py | 32 +- vllm/v1/attention/backends/cpu_attn.py | 780 +----------------- vllm/v1/attention/backends/flex_attention.py | 13 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 1 + vllm/v1/attention/backends/rocm_aiter_fa.py | 1 + vllm/v1/attention/backends/utils.py | 2 +- vllm/v1/core/kv_cache_coordinator.py | 12 +- vllm/v1/core/kv_cache_manager.py | 16 +- vllm/v1/core/kv_cache_utils.py | 6 +- vllm/v1/engine/processor.py | 5 + vllm/v1/kv_cache_interface.py | 7 +- vllm/v1/pool/metadata.py | 1 + vllm/v1/sample/logits_processor.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 5 +- vllm/v1/structured_output/__init__.py | 35 +- vllm/v1/worker/cpu_model_runner.py | 1 + vllm/v1/worker/cpu_worker.py | 1 + vllm/v1/worker/gpu_model_runner.py | 125 ++- vllm/v1/worker/xpu_model_runner.py | 1 + vllm/v1/worker/xpu_worker.py | 1 + 169 files changed, 4483 insertions(+), 2150 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu create mode 100644 requirements/kv_connectors.txt create mode 100644 tests/config/test_mp_reducer.py create mode 100644 tests/kernels/moe/test_cutlass_grouped_gemm.py create mode 100644 tests/models/language/pooling/test_mxbai_rerank.py create mode 100644 vllm/attention/backends/torch_sdpa.py create mode 100644 vllm/attention/ops/ipex_attn.py delete mode 100644 vllm/transformers_utils/configs/h2ovl.py delete mode 100644 vllm/transformers_utils/configs/internvl.py diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index ee13e1aabc89..6314afd65234 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -52,7 +52,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress 
plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Annotate release workflow" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index afe3e4b7ef69..737b2eede9c6 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -48,16 +48,10 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - # Note: disable until supports V1 - # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model - # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model - - # Note: disable Bart until supports V1 - pytest -v -s tests/models/language/generation -m cpu_model \ - --ignore=tests/models/language/generation/test_bart.py - VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ - --ignore=tests/models/language/generation/test_bart.py - + pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + pytest -v -s tests/models/language/generation -m cpu_model + VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model pytest -v -s tests/models/language/pooling -m cpu_model pytest -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_mllama.py \ @@ -68,15 +62,21 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" - # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e # VLLM_USE_V1=0 pytest -s -v \ # tests/quantization/test_ipex_quant.py" + # Run chunked-prefill and prefix-cache test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v -k cpu_model \ + tests/basic_correctness/test_chunked_prefill.py" + # online serving docker exec cpu-test-"$NUMA_NODE" bash -c " set -e diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 175269e857e0..148cf8074232 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -217,7 +217,7 @@ steps: ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/test_regression @@ -227,7 +227,7 @@ steps: working_dir: "/vllm-workspace/tests" # optional - label: Engine Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/engine @@ -340,7 +340,7 @@ steps: parallelism: 4 - label: PyTorch Compilation Unit Tests - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -422,7 +422,7 @@ 
steps: - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -514,7 +514,7 @@ steps: ##### models test ##### - label: Basic Models Test # 24min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -603,7 +603,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - label: Multi-Modal Models Test (Extended) 3 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index da7f89747a16..2acb03d52a67 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,7 +16,7 @@ /vllm/lora @jeejeelee /vllm/reasoning @aarnphm /vllm/entrypoints @aarnphm -CMakeLists.txt @tlrmchlsmth +CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 64011922ad82..74a7a3a3530f 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -68,7 +68,7 @@ jobs: export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b1daeeed83e..0129f85123fb 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -615,6 +615,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures.") endif() endif() + + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " + "if you intend on running FP8 quantized MoE models on Blackwell.") + else() + message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " + "in CUDA target architectures") + endif() + endif() # # Machete kernels diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index d17443871cf6..920961899038 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy import itertools diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index d922a3349e1e..ce7f47cf7233 100644 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -45,7 +45,6 @@ #include "cute/algorithm/functional.hpp" #include "cute/atom/mma_atom.hpp" #include "cute/algorithm/gemm.hpp" -#include "cute/tensor_predicate.hpp" #include "cute/numeric/arithmetic_tuple.hpp" #include "cutlass_extensions/gemm/dispatch_policy.hpp" diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 87117a165fe9..5cd2ac179768 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel( // calculate for absmax float thread_max = 0.f; - for (int i = tid; i < hidden_size; i += stride) { - const auto v = fabsf(static_cast(row_in[i])); - thread_max = fmaxf(thread_max, v); - } + vectorize_read_with_alignment<16>( + row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) { + const float v = fabsf(static_cast(src)); + thread_max = fmaxf(thread_max, v); + }); using 
BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmp; float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); @@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( // 1. calculate min & max MinMax thread_mm; - for (int i = tid; i < hidden_size; i += stride) { - thread_mm += static_cast(row_in[i]); - } + vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride, + [&] __device__(const scalar_t& src) { + thread_mm += static_cast(src); + }); using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmp; diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index 2387ec57e8f2..2d67da98763e 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -51,7 +51,8 @@ struct cutlass_3x_gemm { // These are the minimum alignments needed for the kernels to compile static constexpr int AlignmentAB = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentCD = 4; + static constexpr int AlignmentCD = + 128 / cutlass::sizeof_bits::value; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu new file mode 100644 index 000000000000..236d76ed5208 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu @@ -0,0 +1,374 @@ +#include "core/registration.h" + +#include +#include + +#include +#include +#include + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/gett.hpp" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include + +using namespace cute; + +template +__global__ void get_ggemm_starts( + int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scale_offsets, + ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int, + ElementAB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scale_base_as_int, + ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int, + LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) { + int expert_id = threadIdx.x; + + if (expert_id >= gridDim.x * blockDim.x) { + return; + } + + int m = problem_sizes[expert_id * 3]; + int n = problem_sizes[expert_id * 3 + 1]; + int k = problem_sizes[expert_id * 3 + 2]; + + int32_t expert_offset = expert_offsets[expert_id]; + int a_stride = expert_offset * k; + int b_stride = 
expert_id * k * n; + int a_scale_stride = expert_offset * k / 128; + int b_scale_stride = expert_id * k * n / 128 / 128; + + a_offsets[expert_id] = a_base_as_int + a_stride; + b_offsets[expert_id] = b_base_as_int + b_stride; + out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride; + b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride; + + LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id; + LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id; + + *layout_sfa_ptr = + ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1)); + *layout_sfb_ptr = + ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1)); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \ + ScaleConfig) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_ggemm_starts<<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), \ + reinterpret_cast(layout_sfa.data_ptr()), \ + reinterpret_cast(layout_sfb.data_ptr()), \ + static_cast(problem_sizes.data_ptr())); \ + } + +template +void run_get_ggemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, + torch::Tensor out_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& layout_sfa, + torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0); + TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0); + + int num_experts = (int)expert_offsets.size(0); + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA, + LayoutSFB, ScaleConfig) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA, + LayoutSFB, ScaleConfig) + else { + TORCH_CHECK(false, "Unsupported output tensor type"); + } +} + +template +void run_blockwise_scaled_group_mm( + torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs, + const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs, + const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a, + const torch::Tensor& stride_b, const torch::Tensor& stride_c, + const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb, + const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { + using ProblemShape = cutlass::gemm::GroupProblemShape>; + + // Types + using ElementA = cutlass::float_e4m3_t; + using ElementB = cutlass::float_e4m3_t; + using ElementC = OutType; + using ElementD = ElementC; + using ElementAccumulator = float; + using LayoutA = cutlass::layout::RowMajor; + using 
LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = LayoutD; + + // Alignments + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using ArchTag = cutlass::arch::Sm100; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape, + typename ScheduleConfig::ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*, + AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, + cute::tuple, + AlignmentA, ElementB, + cute::tuple, + AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape, + typename ScheduleConfig::ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + typename ScheduleConfig::KernelSchedule>::CollectiveOp; + + using GemmKernel = + cutlass::gemm::kernel::GemmUniversal; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + using StrideA = typename Gemm::GemmKernel::InternalStrideA; + using StrideB = typename Gemm::GemmKernel::InternalStrideB; + using StrideC = typename Gemm::GemmKernel::InternalStrideC; + using StrideD = typename Gemm::GemmKernel::InternalStrideD; + + using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape; + int num_experts = (int)expert_offsets.size(0); + + Gemm gemm_op; + + // Mainloop Arguments + typename GemmKernel::MainloopArguments mainloop_args{ + static_cast(a_ptrs.data_ptr()), + static_cast(stride_a.data_ptr()), + static_cast(b_ptrs.data_ptr()), + static_cast(stride_b.data_ptr()), + static_cast(a_scales_ptrs.data_ptr()), + reinterpret_cast( + layout_sfa.data_ptr()), + static_cast(b_scales_ptrs.data_ptr()), + reinterpret_cast( + layout_sfb.data_ptr())}; + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = a_ptrs.get_device(); + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + // Epilogue Arguments + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, // epilogue.thread + nullptr, + static_cast(stride_c.data_ptr()), + static_cast(out_ptrs.data_ptr()), + static_cast(stride_c.data_ptr())}; + + UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast(problem_sizes.data_ptr()); + + // Gemm Arguments + typename GemmKernel::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {num_experts, problem_sizes_as_shapes, nullptr}, + mainloop_args, + epilogue_args, + hw_info}; + + at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()}; + const cudaStream_t stream = + at::cuda::getCurrentCUDAStream(a_ptrs.get_device()); + + auto can_implement_status = gemm_op.can_implement(args); + TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, + "Failed to implement GEMM"); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto status = gemm_op.initialize(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == 
cutlass::Status::kSuccess, "Failed to initialize GEMM"); + + status = gemm_op.run(stream); + TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); +} + +template +void blockwise_scaled_group_mm_dispatch_shape( + torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { + struct MmaConfig { + using ElementA = cutlass::float_e4m3_t; + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; + using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< + 1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>; + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + using LayoutC = cutlass::layout::RowMajor; + using MmaTileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_1, _1, _1>; + }; + + int num_experts = (int)expert_offsets.size(0); + + auto a_ptrs = torch::empty( + {num_experts}, + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto b_ptrs = torch::empty( + {num_experts}, + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto out_ptrs = torch::empty( + {num_experts}, + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto a_scales_ptrs = torch::empty( + {num_experts}, + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto b_scales_ptrs = torch::empty( + {num_experts}, + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + + auto layout_sfa = torch::empty( + {num_experts, 5}, + torch::TensorOptions().dtype(torch::kInt32).device(a.device())); + auto layout_sfb = torch::empty( + {num_experts, 5}, + torch::TensorOptions().dtype(torch::kInt32).device(a.device())); + + auto stride_a = torch::full( + {num_experts}, a.size(1), + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto stride_b = torch::full( + {num_experts}, a.size(1), + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + auto stride_c = torch::full( + {num_experts}, output.size(1), + torch::TensorOptions().dtype(torch::kInt64).device(a.device())); + + torch::TensorOptions options_int = + torch::TensorOptions().dtype(torch::kInt64).device(a.device()); + + run_get_ggemm_starts( + expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a, + b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes); + + run_blockwise_scaled_group_mm( + out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a, + stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes, + expert_offsets); +} + +void cutlass_blockwise_scaled_grouped_mm( + torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, + const torch::Tensor& scales_a, const torch::Tensor& scales_b, + const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { + TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); + TORCH_CHECK(problem_sizes.size(1) == 3, + "problem_sizes must have shape (num_experts, 3)"); + TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), + "Number of experts in problem_sizes must match expert_offsets"); + TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, + "problem_sizes must be int32"); + TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn, + "a must be kFloat8_e4m3fn"); + 
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn, + "b must be kFloat8_e4m3fn"); + TORCH_CHECK(output.scalar_type() == torch::kBFloat16 || + output.scalar_type() == torch::kHalf, + "output must be bfloat16 or half"); + TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, + "scales_a must be float32"); + TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, + "scales_b must be float32"); + TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, + "expert_offsets must be int32"); + + TORCH_CHECK(output.dim() == 2, "output must be 2D tensor"); + TORCH_CHECK(a.dim() == 2, "a must be 2D tensor"); + TORCH_CHECK(b.dim() == 3, "b must be 3D tensor"); + TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor"); + TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor"); + TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); + TORCH_CHECK(problem_sizes.size(1) == 3, + "problem_sizes must have shape (num_experts, 3)"); + TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), + "Number of experts in problem_sizes must match expert_offsets"); + TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, + "problem_sizes must be int32"); + TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor"); + +#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100 + if (output.scalar_type() == torch::kBFloat16) { + blockwise_scaled_group_mm_dispatch_shape( + output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); + } else if (output.scalar_type() == torch::kFloat16) { + blockwise_scaled_group_mm_dispatch_shape( + output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); + } else { + TORCH_CHECK(false, "Unsupported output tensor type"); + } +#endif +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("cutlass_blockwise_scaled_grouped_mm", + &cutlass_blockwise_scaled_grouped_mm); +} diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index eca5d328b00c..2f52a6b7a024 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -38,7 +38,6 @@ #include "cute/atom/mma_atom.hpp" #include "cute/atom/copy_traits_sm90_tma.hpp" #include "cute/algorithm/gemm.hpp" -#include "cute/tensor_predicate.hpp" #include "cute/numeric/arithmetic_tuple.hpp" #include "cutlass/pipeline/pipeline.hpp" #include "cutlass/transform/collective/sm90_wgmma_transpose.hpp" diff --git a/csrc/quantization/vectorization_utils.cuh b/csrc/quantization/vectorization_utils.cuh index 8d3c1d6d3b9f..8aa0147df6ba 100644 --- a/csrc/quantization/vectorization_utils.cuh +++ b/csrc/quantization/vectorization_utils.cuh @@ -27,6 +27,26 @@ __device__ inline void vectorize_with_alignment( constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B uintptr_t addr = reinterpret_cast(in); + // fast path when the whole region is already aligned + // Note: currently the output is guaranteed to be same as the input, so we + // don't check it here, comments here just for future reference. 
+ bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0); + if (can_vec) { + int num_vec = len / VEC_SIZE; + + using vin_t = vec_n_t; + using vout_t = vec_n_t; + auto* v_in = reinterpret_cast(in); + auto* v_out = reinterpret_cast(out); + + for (int i = tid; i < num_vec; i += stride) { + vout_t tmp; + vec_op(tmp, v_in[i]); + v_out[i] = tmp; + } + return; + } + int misalignment_offset = addr & (WIDTH - 1); // addr % 64 int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64) int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64 @@ -72,4 +92,81 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in, std::forward(scalar_op)); } +template +struct DefaultReadVecOp { + ScaOp scalar_op; + + __device__ __forceinline__ void operator()( + const vec_n_t& src) const { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + scalar_op(src.val[i]); + } + } +}; + +// read-only version: iterate over the input with alignment guarantees +template +__device__ inline void vectorize_read_with_alignment(const InT* in, int len, + int tid, int stride, + VecOp&& vec_op, + ScaOp&& scalar_op) { + static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0, + "VEC_SIZE must be a positive power-of-two"); + constexpr int WIDTH = VEC_SIZE * sizeof(InT); + uintptr_t addr = reinterpret_cast(in); + + // fast path when the whole region is already aligned + bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0); + if (can_vec) { + int num_vec = len / VEC_SIZE; + + using vin_t = vec_n_t; + auto* v_in = reinterpret_cast(in); + + for (int i = tid; i < num_vec; i += stride) { + vec_op(v_in[i]); + } + return; + } + + int misalignment_offset = addr & (WIDTH - 1); + int alignment_bytes = WIDTH - misalignment_offset; + int prefix_elems = alignment_bytes & (WIDTH - 1); + prefix_elems /= sizeof(InT); + prefix_elems = min(prefix_elems, len); + + // 1. handle the possibly unaligned prefix with scalar access. + for (int i = tid; i < prefix_elems; i += stride) { + scalar_op(in[i]); + } + + in += prefix_elems; + len -= prefix_elems; + + int num_vec = len / VEC_SIZE; + using vin_t = vec_n_t; + auto* v_in = reinterpret_cast(in); + + // 2. vectorized traversal of the main aligned region. + for (int i = tid; i < num_vec; i += stride) { + vec_op(v_in[i]); + } + + // 3. handle remaining tail elements. 
+ int tail_start = num_vec * VEC_SIZE; + for (int i = tid + tail_start; i < len; i += stride) { + scalar_op(in[i]); + } +} + +// overload that requires only a scalar_op +template +__device__ __forceinline__ void vectorize_read_with_alignment( + const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) { + using Vec = DefaultReadVecOp>; + vectorize_read_with_alignment(in, len, tid, stride, Vec{scalar_op}, + std::forward(scalar_op)); +} + } // namespace vllm diff --git a/csrc/sampler.cu b/csrc/sampler.cu index ee5793dda0ef..b0cce2e98d22 100644 --- a/csrc/sampler.cu +++ b/csrc/sampler.cu @@ -59,6 +59,8 @@ void apply_repetition_penalties_( int vocab_size = logits.size(-1); int num_seqs = logits.size(0); + if (num_seqs == 0) return; + // Get number of SMs on the current device int sms = 0; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh index c22523da4e43..637bba1384a4 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh @@ -79,7 +79,8 @@ struct cutlass_sparse_3x_gemm { // These are the minimum alignments needed for the kernels to compile static constexpr int AlignmentAB = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentCD = 4; + static constexpr int AlignmentCD = + 128 / cutlass::sizeof_bits::value; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 8bb71cad29da..9414e26196b2 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -393,6 +393,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); + // cutlass blockwise scaledgroup GEMM + ops.def( + "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " + "Tensor scales_a, Tensor scales_b, " + "Tensor problem_sizes, Tensor expert_offsets) -> ()", + {stride_tag}); + // conditionally compiled so impl registration is in source file + // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," diff --git a/docker/Dockerfile b/docker/Dockerfile index ec18c45a096a..c49b5da2714c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,3 +1,4 @@ + # The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server. 
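For intuition on the kernel change above: the new `vectorize_read_with_alignment` helper splits each row into an unaligned scalar prefix, an aligned vectorized body, and a scalar tail, which is what lets the int8 quant kernels earlier in this patch read 16 elements per access without assuming aligned pointers. The sketch below is a rough Python model of that traversal plan only, not code from this patch; the element size, base address, and vector width are made-up illustration values.

```python
# Rough model of the prefix / vector-body / tail split performed by
# vectorize_read_with_alignment (illustration only, not vLLM code).
def traversal_plan(base_addr: int, n_elems: int, elem_size: int = 2, vec_size: int = 16):
    width = vec_size * elem_size  # bytes covered by one vector access
    # fast path: pointer already aligned and length a multiple of the vector size
    if base_addr % width == 0 and n_elems % vec_size == 0:
        return {"prefix": 0, "vectors": n_elems // vec_size, "tail": 0}
    # scalar prefix until the pointer reaches a width-aligned boundary
    prefix = min(((width - base_addr % width) % width) // elem_size, n_elems)
    body = n_elems - prefix
    vectors = body // vec_size        # aligned, vectorized middle section
    tail = body - vectors * vec_size  # leftover elements handled scalar
    return {"prefix": prefix, "vectors": vectors, "tail": tail}

# e.g. a 1000-element fp16 row starting 6 bytes past a 32-byte boundary
print(traversal_plan(base_addr=0x1006, n_elems=1000))
# {'prefix': 13, 'vectors': 61, 'tail': 11}
```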
@@ -62,12 +63,16 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly ARG PIP_KEYRING_PROVIDER=disabled ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER} +# Flag enables build-in KV-connector dependency libs into docker images +ARG INSTALL_KV_CONNECTORS=false + #################### BASE BUILD IMAGE #################### # prepare basic build environment FROM ${BUILD_BASE_IMAGE} AS base ARG CUDA_VERSION ARG PYTHON_VERSION ARG TARGETPLATFORM +ARG INSTALL_KV_CONNECTORS=false ENV DEBIAN_FRONTEND=noninteractive ARG DEADSNAKES_MIRROR_URL @@ -276,6 +281,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ FROM ${FINAL_BASE_IMAGE} AS vllm-base ARG CUDA_VERSION ARG PYTHON_VERSION +ARG INSTALL_KV_CONNECTORS=false WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM @@ -485,6 +491,7 @@ RUN mv mkdocs.yaml test_docs/ # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base ARG TARGETPLATFORM +ARG INSTALL_KV_CONNECTORS=false ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL @@ -493,8 +500,13 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +COPY requirements/kv_connectors.txt requirements/kv_connectors.txt + # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/uv \ + if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ + uv pip install --system -r requirements/kv_connectors.txt; \ + fi; \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ diff --git a/docs/contributing/incremental_build.md b/docs/contributing/incremental_build.md index 14c3aaead51e..33584fdd5d40 100644 --- a/docs/contributing/incremental_build.md +++ b/docs/contributing/incremental_build.md @@ -14,7 +14,7 @@ Before setting up the incremental build: VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto ``` -2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary. +2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary. 3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). 
Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager. diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e3a77afb02f1..ed11d2836037 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis Full example: +If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: + +```python +from vllm import LLM +from vllm.assets.image import ImageAsset + +llm = LLM(model="llava-hf/llava-1.5-7b-hf") +image_url = "https://picsum.photos/id/32/512/512" +image_pil = ImageAsset('cherry_blossom').pil_image +image_embeds = torch.load(...) + +conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, + { + "role": "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + },{ + "type": "image_pil", + "image_pil": image_pil + }, { + "type": "image_embeds", + "image_embeds": image_embeds + }, { + "type": "text", + "text": "What's in these images?" + }], + }, +] + +# Perform inference and log output. +outputs = llm.chat(conversation) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: ??? Code @@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions If no default chat template is available, we will first look for a built-in fallback in . If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. - For certain models, we provide alternative chat templates inside . + For certain models, we provide alternative chat templates inside . For example, VLM2Vec uses which is different from the default one for Phi-3-Vision. ### Image Inputs diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index eb32aa361efd..23d71fd44525 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -470,6 +470,7 @@ Specified using `--task classify`. |----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------| | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | + If your model is not in the above list, we will try to automatically convert the model using [as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. @@ -477,12 +478,20 @@ If your model is not in the above list, we will try to automatically convert the Specified using `--task score`. 
-| Architecture | Models | Example HF Models | [V1](gh-issue:8779) | -|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | -| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | +| Architecture | Models | Example HF Models | [V1](gh-issue:8779) | +|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | +| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | +| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | + +!!! note + Load the official original `mxbai-rerank-v2` by using the following command. + + ```bash + vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' + ``` !!! note Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: . @@ -490,6 +499,7 @@ Specified using `--task score`. ```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' ``` + [](){ #supported-mm-models } ## List of Multimodal Language Models diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 330103d5818a..a38fc9216d40 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -6,6 +6,7 @@ from vllm import LLM from vllm.sampling_params import SamplingParams +from vllm.assets.image import ImageAsset # This script is an offline demo for running Mistral-Small-3.1 # @@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace): ) prompt = "Describe this image in one sentence." 
- image_url = "https://picsum.photos/id/237/200/300" messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + }, ], }, ] diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md index e0122c05cff1..8c9c1c92b676 100644 --- a/examples/offline_inference/profiling_tpu/README.md +++ b/examples/offline_inference/profiling_tpu/README.md @@ -57,7 +57,10 @@ Once you have collected your profiles with this script, you can visualize them u Here are most likely the dependencies you need to install: ```bash -pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources +pip install tensorflow-cpu \ + tensorboard-plugin-profile \ + etils \ + importlib_resources ``` Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 3f38aa9fcaa6..26e492fed25f 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from transformers import AutoTokenizer diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index b136b14cd8ea..bf7be33107da 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -677,6 +677,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, tensor_parallel_size=2, limit_mm_per_prompt={modality: 1}, + ignore_patterns=["consolidated.safetensors"], ) prompts = [f"[INST]{question}\n[IMG][/INST]" for question in questions] diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 13af8e904194..eb4f3b6c8f44 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -505,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: max_num_seqs=2, tensor_parallel_size=2, limit_mm_per_prompt={"image": len(image_urls)}, + ignore_patterns=["consolidated.safetensors"], ) placeholders = "[IMG]" * len(image_urls) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py index 73f2caaa0dbd..4e82424d6cd7 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import socket diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py index 62b1ec71af14..cb230913a422 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import 
asyncio from typing import Optional diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py b/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py index 3de5e2b544c8..f0b0a2db44ed 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_xlam.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 """ Set up this example by starting a vLLM OpenAI-compatible server with tool call diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py b/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py index 5847414b1171..94e664c9ec3d 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 """ Set up this example by starting a vLLM OpenAI-compatible server with tool call diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md index c9b97f11eefd..d2777a43d478 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/online_serving/structured_outputs/README.md @@ -13,13 +13,15 @@ vllm serve Qwen/Qwen2.5-3B-Instruct To serve a reasoning model, you can use the following command: ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ + --reasoning-parser deepseek_r1 ``` If you want to run this script standalone with `uv`, you can use the following: ```bash -uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output +uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \ + structured-output ``` See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information. 
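The README above only shows how to launch the server and the standalone script. For reference, a client typically passes the constraint through the OpenAI-compatible `extra_body` field, as sketched below; this snippet is not part of this diff, the `guided_choice`/`guided_regex` parameter names follow vLLM's structured outputs documentation, and the local URL and model name are assumptions.

```python
# Minimal client sketch against the server started above (assumes
# `vllm serve Qwen/Qwen2.5-3B-Instruct` is listening on localhost:8000).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# constrain the answer to a fixed set of choices
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "Is vLLM written in Python or Rust?"}],
    extra_body={"guided_choice": ["Python", "Rust"]},
)
print(completion.choices[0].message.content)

# or constrain the output with a regex, streaming the tokens back
stream = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "Give me an ISO date."}],
    extra_body={"guided_regex": r"\d{4}-\d{2}-\d{2}"},
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```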
@@ -44,7 +46,9 @@ uv run structured_outputs.py --stream Run certain constraints, for example `structural_tag` and `regex`, streaming: ```bash -uv run structured_outputs.py --constraint structural_tag regex --stream +uv run structured_outputs.py \ + --constraint structural_tag regex \ + --stream ``` Run all constraints, with reasoning models and streaming: diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 9e1003a5c39d..11233229561b 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -202,7 +202,7 @@ def parse_args(): -def deserialize(): +def deserialize(args, tensorizer_config): if args.lora_path: tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir llm = LLM(model=args.model, @@ -242,7 +242,7 @@ def deserialize(): return llm -if __name__ == '__main__': +def main(): args = parse_args() s3_access_key_id = (getattr(args, 's3_access_key_id', None) @@ -260,8 +260,6 @@ def deserialize(): model_ref = args.model - model_name = model_ref.split("/")[1] - if args.command == "serialize" or args.command == "deserialize": keyfile = args.keyfile else: @@ -309,6 +307,10 @@ def deserialize(): encryption_keyfile = keyfile, **credentials ) - deserialize() + deserialize(args, tensorizer_config) else: raise ValueError("Either serialize or deserialize must be specified.") + + +if __name__ == "__main__": + main() diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt new file mode 100644 index 000000000000..262675a23120 --- /dev/null +++ b/requirements/kv_connectors.txt @@ -0,0 +1 @@ +lmcache \ No newline at end of file diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 5e6679adfbdc..37ec753bbc9e 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py new file mode 100644 index 000000000000..ee351cbfa7c1 --- /dev/null +++ b/tests/config/test_mp_reducer.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import sys +from unittest.mock import patch + +from vllm.config import VllmConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.v1.engine.async_llm import AsyncLLM + + +def test_mp_reducer(monkeypatch): + """ + Test that _reduce_config reducer is registered when AsyncLLM is instantiated + without transformers_modules. This is a regression test for + https://github.com/vllm-project/vllm/pull/18640. 
+ """ + + # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value + monkeypatch.setenv('VLLM_USE_V1', '1') + + # Ensure transformers_modules is not in sys.modules + if 'transformers_modules' in sys.modules: + del sys.modules['transformers_modules'] + + with patch('multiprocessing.reducer.register') as mock_register: + engine_args = AsyncEngineArgs( + model="facebook/opt-125m", + max_model_len=32, + gpu_memory_utilization=0.1, + disable_log_stats=True, + disable_log_requests=True, + ) + + async_llm = AsyncLLM.from_engine_args( + engine_args, + start_engine_loop=False, + ) + + assert mock_register.called, ( + "multiprocessing.reducer.register should have been called") + + vllm_config_registered = False + for call_args in mock_register.call_args_list: + # Verify that a reducer for VllmConfig was registered + if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig: + vllm_config_registered = True + + reducer_func = call_args[0][1] + assert callable( + reducer_func), "Reducer function should be callable" + break + + assert vllm_config_registered, ( + "VllmConfig should have been registered to multiprocessing.reducer" + ) + + async_llm.shutdown() diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index dab14f1d7d03..e1d175d9c6e1 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb): model_name = "openai/whisper-large-v3-turbo" server_args = ["--enforce-eager"] # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. - prompt = "THE FIRST WORDS I SPOKE" with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() transcription = await client.audio.transcriptions.create( @@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb): temperature=0.0) out = json.loads(transcription)['text'] assert "Mary had a little lamb," in out - # This should "force" whisper to continue prompt in all caps - transcription_wprompt = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - response_format="text", - prompt=prompt, - temperature=0.0) - out_capital = json.loads(transcription_wprompt)['text'] - assert prompt not in out_capital @pytest.mark.asyncio @@ -238,3 +227,31 @@ async def test_sampling_params(mary_had_lamb): extra_body=dict(seed=42)) assert greedy_transcription.text != transcription.text + + +@pytest.mark.asyncio +async def test_audio_prompt(mary_had_lamb): + model_name = "openai/whisper-large-v3-turbo" + server_args = ["--enforce-eager"] + prompt = "This is a speech, recorded in a phonograph." + with RemoteOpenAIServer(model_name, server_args) as remote_server: + #Prompts should not omit the part of original prompt while transcribing. 
+ prefix = "The first words I spoke in the original phonograph" + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert prefix in out + transcription_wprompt = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0) + out_prompt = json.loads(transcription_wprompt)['text'] + assert prefix in out_prompt diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 49294664275a..e41ea686e992 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images( "url": image_url } }, { - "type": "image_url", - "image_url": { - "url": image_url - } + "type": "image_pil", + "image_pil": ImageAsset('cherry_blossom').pil_image }, { "type": "text", "text": "What's in these images?" @@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async( "url": image_url } }, { - "type": "image_url", - "image_url": { - "url": image_url - } + "type": "image_pil", + "image_pil": ImageAsset('cherry_blossom').pil_image }, { "type": "text", "text": "What's in these images?" diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 7269d19183bf..2e0b4efebfdb 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -450,7 +450,8 @@ def test_multi_query_kv_attention( start += seq_len # xformers.AttentionBias to Tensor for use in reference impl. 
alibi_bias = [ - b.materialize(b.shape, device=device).squeeze() for b in attn_bias + b.materialize((1, num_query_heads, i, i), device=device).squeeze() + for b, i in zip(attn_bias, seq_lens) ] else: attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 73087e09e730..3ee2a965c58b 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -182,6 +182,34 @@ def test_env( assert backend.get_name() == expected +@pytest.mark.parametrize("device", ["cpu", "cuda"]) +@pytest.mark.parametrize("use_v1", [True, False]) +def test_fp32_fallback( + device: str, + use_v1: bool, + monkeypatch: pytest.MonkeyPatch, +): + """Test attention backend selection with fp32.""" + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", + CpuPlatform()): + backend = get_attn_backend(16, torch.float32, torch.float32, + 16, False) + assert (backend.get_name() == "TORCH_SDPA_VLLM_V1" + if use_v1 else "TORCH_SDPA") + + elif device == "cuda": + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float32, torch.float32, + 16, False) + assert (backend.get_name() == "FLEX_ATTENTION" + if use_v1 else "XFORMERS") + + def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 7797e4f0c9c0..f4049eb0d095 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ DeepEP test utilities """ @@ -137,8 +138,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup, low_latency_mode=low_latency_mode, num_qps_per_rank=num_qps_per_rank) return DeepEPHTPrepareAndFinalize(buffer=buffer, - world_size=pgi.world_size, - rank=pgi.rank, + num_dispatchers=pgi.world_size, dp_size=dp_size, rank_expert_offset=pgi.rank * ht_args.num_local_experts) @@ -146,7 +146,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup, def make_deepep_ll_a2a(pg: ProcessGroup, pgi: ProcessGroupInfo, - dp_size: int, deepep_ll_args: DeepEPLLArgs, q_dtype: Optional[torch.dtype] = None, block_shape: Optional[list[int]] = None): @@ -166,8 +165,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup, return DeepEPLLPrepareAndFinalize( buffer=buffer, - world_size=pgi.world_size, - dp_size=dp_size, + num_dispatchers=pgi.world_size, max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank, use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch, ) @@ -186,5 +184,4 @@ def make_deepep_a2a(pg: ProcessGroup, block_shape) assert deepep_ll_args is not None - return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype, - block_shape) + return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 779fa1df086d..c9a4375ac939 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -10,7 +10,7 @@ from tests.kernels.moe.utils import (batched_moe, make_quantized_test_activations, - make_test_weights, triton_moe) + make_test_weights, naive_batched_moe) from tests.kernels.quant_utils 
import native_batched_masked_quant_matmul from tests.kernels.utils import torch_experts from vllm.config import VllmConfig, set_current_vllm_config @@ -33,12 +33,10 @@ (45, 512, 512), (45, 1024, 128), (45, 1024, 2048), - (64, 128, 128), (64, 512, 512), (64, 1024, 2048), (222, 128, 128), (222, 128, 2048), - (222, 512, 512), (222, 1024, 128), (222, 1024, 2048), ] @@ -95,11 +93,12 @@ def make_tensors(config: BatchedMMConfig): @pytest.mark.parametrize("max_tokens_per_expert", [32, 64, 128, 192, 224, 256, 512]) @pytest.mark.parametrize("K", [128, 256, 1024]) -@pytest.mark.parametrize("N", [128, 256, 512, 1024]) -@pytest.mark.parametrize("dtype", - [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("block_shape", [None]) -@pytest.mark.parametrize("per_act_token_quant", [False]) +@pytest.mark.parametrize("N", [128, 256, 1024]) +@pytest.mark.parametrize( + "dtype", + [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("block_shape", [None, [128, 128]]) +@pytest.mark.parametrize("per_act_token_quant", [False, True]) def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, N: int, dtype: torch.dtype, block_shape: Optional[list[int]], @@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, in_dtype=act_dtype, quant_dtype=quant_dtype, block_shape=block_shape, - per_act_token_quant=per_act_token_quant) + per_act_token_quant=per_act_token_quant, + ) B, B_q, B_scale, _, _, _ = make_test_weights( num_experts, @@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, in_dtype=act_dtype, quant_dtype=quant_dtype, block_shape=block_shape, + per_act_token_quant=per_act_token_quant, ) out_shape = (num_experts, max_tokens_per_expert, N) @@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32 }, + per_act_token_quant=per_act_token_quant, block_shape=block_shape, ) @@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, B, ref_output, num_expert_tokens, - None, - None, - None, ) q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output, num_expert_tokens, A_scale, B_scale, - block_shape) + block_shape, + per_act_token_quant) rtol, atol = { torch.float16: (6e-2, 6e-2), @@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, torch.float32: (1e-2, 1e-2), }[test_output.dtype] - torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol) + torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol) torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol) @pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@pytest.mark.parametrize("per_act_token_quant", [False]) -@pytest.mark.parametrize("block_shape", [None]) +@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16]) +@pytest.mark.parametrize("per_act_token_quant", [False, True]) +@pytest.mark.parametrize("block_shape", [None, [128, 128]]) +@pytest.mark.parametrize("input_scales", [False]) def test_fused_moe_batched_experts( m: int, n: int, @@ -220,15 +221,19 @@ def test_fused_moe_batched_experts( dtype: torch.dtype, per_act_token_quant: bool, block_shape: Optional[list[int]], + input_scales: bool, ): 
current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn + if topk > e: + pytest.skip("topk > e") + if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): pytest.skip("Skip quantization test for non-quantized type") - if per_act_token_quant and block_shape is not None or topk > e: + if per_act_token_quant and block_shape is not None: pytest.skip("Skip illegal quantization test.") a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 @@ -241,16 +246,27 @@ def test_fused_moe_batched_experts( act_dtype = dtype quant_dtype = None - _, w1, w1_s, _, w2, w2_s = make_test_weights(e, - n, - k, - block_shape=block_shape, - in_dtype=act_dtype, - quant_dtype=quant_dtype) + w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights( + e, + n, + k, + block_shape=block_shape, + in_dtype=act_dtype, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + ) + + if input_scales and quant_dtype is not None: + a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32) + a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32) + else: + a1_scale = None + a2_scale = None with set_current_vllm_config(vllm_config): topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) - batched_output = batched_moe( + + baseline_output = torch_experts( a, w1, w2, @@ -258,11 +274,14 @@ def test_fused_moe_batched_experts( topk_ids, w1_scale=w1_s, w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, quant_dtype=quant_dtype, per_act_token_quant=per_act_token_quant, block_shape=block_shape, ) - baseline_output = torch_experts( + + batched_output = naive_batched_moe( a, w1, w2, @@ -270,11 +289,14 @@ def test_fused_moe_batched_experts( topk_ids, w1_scale=w1_s, w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, quant_dtype=quant_dtype, per_act_token_quant=per_act_token_quant, - block_shape=block_shape) + block_shape=block_shape, + ) - triton_output = triton_moe( + triton_output = batched_moe( a, w1, w2, @@ -282,14 +304,16 @@ def test_fused_moe_batched_experts( topk_ids, w1_scale=w1_s, w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, quant_dtype=quant_dtype, per_act_token_quant=per_act_token_quant, block_shape=block_shape, ) - torch.testing.assert_close(triton_output, + torch.testing.assert_close(batched_output, baseline_output, - atol=2e-2, + atol=3e-2, rtol=2e-2) torch.testing.assert_close(triton_output, diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py new file mode 100644 index 000000000000..67984fe7319a --- /dev/null +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# DeepGEMM Style Cutlass Grouped GEMM Test +# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py + +import random + +import pytest +import torch + +from tests.kernels.utils import baseline_scaled_mm +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + + +def cdiv(a, b): + return (a + b - 1) // b + + +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + pad_size = (128 - (n % 128)) % 128 + x = torch.nn.functional.pad(x, + (0, pad_size), value=0) if pad_size > 0 else x + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + fp8_data = (x_view * + (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn) + return 
fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) + + +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128), + device=x.device, + dtype=x.dtype) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ + (4, 8192, 7168, 4096), + (4, 8192, 2048, 7168), + (8, 4096, 7168, 4096), + (8, 4096, 2048, 7168), + (32, 1024, 7168, 4096), + (32, 1024, 2048, 7168), +]) +@pytest.mark.parametrize("out_dtype", [torch.float16]) +@pytest.mark.skipif( + (lambda x: x is None or x.to_int() != 100)( + current_platform.get_device_capability()), + reason="Block Scaled Grouped GEMM is only supported on SM100.") +def test_cutlass_grouped_gemm( + num_groups: int, + expected_m_per_group: int, + k: int, + n: int, + out_dtype: torch.dtype, +): + device = "cuda" + alignment = 128 + group_ms = [ + int(expected_m_per_group * random.uniform(0.7, 1.3)) + for _ in range(num_groups) + ] + m = sum([cdiv(m, alignment) * alignment for m in group_ms]) + + x = torch.randn((m, k), device=device, dtype=out_dtype) + y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype) + out = torch.empty((m, n), device=device, dtype=out_dtype) + ref_out = torch.randn((m, n), device=device, dtype=out_dtype) + + ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m] + pb_size = [] + for i in range(num_groups): + pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k]) + problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32) + expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32) + + x_fp8 = per_token_cast_to_fp8(x) + y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn), + torch.empty((num_groups, cdiv(n, 128), k // 128), + device=device, + dtype=torch.float)) + for i in range(num_groups): + y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) + + for i in range(num_groups): + a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]] + a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]] + b = y_fp8[0][i].t() + b_scale = y_fp8[1][i].t() + baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype) + ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline + + ops.cutlass_blockwise_scaled_grouped_mm( + out, + x_fp8[0], + y_fp8[0], + x_fp8[1], + y_fp8[1], + problem_sizes, + expert_offsets[:-1], + ) + + torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 9b861d4ebc23..b74137eeaaa6 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test DeepEP + DeepGEMM integration DeepGEMM are gemm kernels specialized for the @@ -148,8 +149,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, fused_experts = BatchedDeepGemmExperts( max_num_tokens=max_tokens_per_rank, - world_size=pgi.world_size, - dp_size=dp_size, + num_dispatchers=pgi.world_size // dp_size, 
block_shape=test_config.block_size, per_act_token_quant=test_config.per_act_token_quant) mk = FusedMoEModularKernel(prepare_finalize=a2a, diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index d7df5bf77035..43804c410b6c 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test deepep dispatch-combine logic """ @@ -154,12 +155,13 @@ def make_modular_kernel( deepep_ht_args = ht_args, deepep_ll_args = ll_args) + num_dispatchers = pgi.world_size // dp_size + if low_latency_mode: assert not per_act_token_quant, "not supported in ll mode" fused_experts = BatchedTritonExperts( max_num_tokens=MAX_TOKENS_PER_RANK, - world_size=pgi.world_size, - dp_size=dp_size, + num_dispatchers=num_dispatchers, use_fp8_w8a8=is_quantized, use_int8_w8a8=False, use_int8_w8a16=False, diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index 5d2690904cea..fa62507179a2 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Unit-test DeepGEMM FP8 kernels (no DeepEP). Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts. diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 184c2dd2f904..e4f4a393dfd5 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform +from vllm.utils import cdiv from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -112,18 +113,21 @@ def pplx_cutlass_moe( w2_scale = w2_scale.to(device) a1_scale = a1_scale.to(device) + assert num_experts % world_size == 0 + num_local_experts = cdiv(num_experts, world_size) + num_dispatchers = pgi.world_size // dp_size + prepare_finalize = PplxPrepareAndFinalize( ata, - max_num_tokens, - pgi.world_size, - rank, - dp_size, - ) + max_num_tokens=max_num_tokens, + num_local_experts=num_local_experts, + num_dispatchers=num_dispatchers) - experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size, + experts = CutlassExpertsFp8(num_local_experts, out_dtype, per_act_token, per_out_ch, + num_dispatchers=num_dispatchers, use_batched_format=True) fused_cutlass_experts = FusedMoEModularKernel( @@ -181,35 +185,40 @@ def _pplx_moe( per_out_ch: bool, use_internode: bool, ): - if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) - else: - group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") - group_name = cpu_group.group_name - - with set_current_vllm_config(vllm_config): - torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights, - topk_ids) - pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale, - w2_scale, topk_weights, topk_ids, - a1_scale, out_dtype, per_act_token, - per_out_ch, group_name) - - torch_output = chunk_by_rank(torch_output, pgi.rank, - pgi.world_size).to(pplx_output.device) - - # Uncomment if more debugging is needed - # print("PPLX OUT:", pplx_output) - 
# print("TORCH OUT:", torch_output) - - torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0) - - if use_internode: - nvshmem_finalize() + try: + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, + backend="gloo") + group_name = cpu_group.group_name + + with set_current_vllm_config(vllm_config): + torch_output = torch_experts(a_full, w1_full, w2_full, + topk_weights, topk_ids) + pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale, + w2_scale, topk_weights, topk_ids, + a1_scale, out_dtype, per_act_token, + per_out_ch, group_name) + + torch_output = chunk_by_rank(torch_output, pgi.rank, + pgi.world_size).to(pplx_output.device) + + # Uncomment if more debugging is needed + # print("PPLX OUT:", pplx_output) + # print("TORCH OUT:", torch_output) + + torch.testing.assert_close(pplx_output, + torch_output, + atol=0.05, + rtol=0) + finally: + if use_internode: + nvshmem_finalize() @pytest.mark.parametrize("m", [2, 224]) diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 186e00800a17..d28e0e040629 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -4,7 +4,10 @@ Run `pytest tests/kernels/test_pplx_moe.py`. """ -from typing import Optional +import itertools +import textwrap +import traceback +from typing import Callable, Optional import pytest import torch @@ -19,12 +22,13 @@ has_pplx = False from tests.kernels.moe.utils import make_test_weights, naive_batched_moe +from tests.kernels.quant_utils import dequant from tests.kernels.utils import torch_experts from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_topk, override_config from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) + BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) @@ -38,22 +42,22 @@ reason="Requires PPLX kernels", ) -PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512), - (222, 2048, 1024)] - -PPLX_MOE_COMBOS = [ - (1, 128, 128), +PPLX_COMBOS = [ + # TODO: figure out why this fails, seems to be test problem + #(1, 128, 128), (2, 128, 512), (3, 1024, 2048), - (32, 128, 1024), + (4, 128, 128), + (32, 1024, 512), (45, 512, 2048), - (64, 1024, 1024), - (222, 1024, 2048), + (64, 1024, 512), + (222, 2048, 1024), + (256, 1408, 2048), ] NUM_EXPERTS = [8, 64] -EP_SIZE = [1, 4] TOP_KS = [1, 2, 6] +DTYPES = [torch.float8_e4m3fn, torch.bfloat16] vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -169,9 +173,11 @@ def test_fused_moe_batched_experts( with set_current_vllm_config(vllm_config): topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) - baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids) + baseline_output = torch_experts(a, w1, w2, topk_weight, + topk_ids) # only for baseline torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids) - batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids) + batched_output = naive_batched_moe( + a, 
w1, w2, topk_weight, topk_ids) # pick torch_experts or this torch.testing.assert_close(baseline_output, torch_output, @@ -183,6 +189,63 @@ def test_fused_moe_batched_experts( rtol=0) +def create_pplx_prepare_finalize( + num_tokens: int, + hidden_dim: int, + topk: int, + num_experts: int, + rank: int, + dp_size: int, + world_size: int, + in_dtype: torch.dtype, + quant_dtype: Optional[torch.dtype], + block_shape: Optional[list[int]], + per_act_token_quant: bool, + group_name: Optional[str], +): + from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( + PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes) + + max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1) + num_local_experts = rank_chunk(num_experts, 0, world_size) + + hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes( + max_num_tokens, + hidden_dim, + in_dtype, + quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + ) + + args = dict( + max_num_tokens=max_num_tokens, + num_experts=num_experts, + experts_per_token=topk, + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=hidden_dim, + hidden_dim_bytes=hidden_dim_bytes, + hidden_dim_scale_bytes=scale_bytes, + ) + + if group_name is None: + ata = AllToAll.internode(**args) + else: + args["group_name"] = group_name + ata = AllToAll.intranode(**args) + + prepare_finalize = PplxPrepareAndFinalize( + ata, + max_num_tokens=max_num_tokens, + num_local_experts=num_local_experts, + num_dispatchers=world_size // dp_size, + ) + + return prepare_finalize, ata + + def rank_chunk(num: int, r: int, w: int) -> int: rem = num % w return (num // w) + (1 if r < rem else 0) @@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: return t[(r * chunk):(r + 1) * chunk] +def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int, + w: int) -> Optional[torch.Tensor]: + if t is not None: + return chunk_by_rank(t, r, w) + else: + return t + + +def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int, + w: int) -> Optional[torch.Tensor]: + if t is not None and t.numel() > 1: + chunk = rank_chunk(t.shape[0], r, w) + return t[(r * chunk):(r + 1) * chunk] + else: + return t + + +def chunk_scales(t: Optional[torch.Tensor], start: int, + end: int) -> Optional[torch.Tensor]: + if t is not None and t.numel() > 1: + return t[start:end] + else: + return t + + +def dummy_work(a: torch.Tensor) -> torch.Tensor: + return a * 1.1 + + def pplx_prepare_finalize( pgi: ProcessGroupInfo, dp_size: int, @@ -200,11 +292,11 @@ def pplx_prepare_finalize( topk_weight: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, + quant_dtype: Optional[torch.dtype], + block_shape: Optional[list[int]], + per_act_token_quant: bool, group_name: Optional[str], ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) - assert torch.cuda.current_device() == pgi.local_rank topk = topk_ids.shape[1] @@ -212,60 +304,66 @@ def pplx_prepare_finalize( device = pgi.device rank = pgi.rank world_size = pgi.world_size - max_num_tokens = rank_chunk(num_tokens, 0, world_size) - - args = dict( - max_num_tokens=max_num_tokens, - num_experts=num_experts, - experts_per_token=topk, - rank=rank, - world_size=world_size, - dp_size=dp_size, - hidden_dim=hidden_dim, - hidden_dim_bytes=hidden_dim * a.dtype.itemsize, - hidden_dim_scale_bytes=0, - ) - - if group_name is None: - ata = AllToAll.internode(**args) - else: - args["group_name"] = group_name - ata = AllToAll.intranode(**args) topk_ids 
= topk_ids.to(dtype=torch.uint32) - prepare_finalize = PplxPrepareAndFinalize( - ata, - max_num_tokens, - world_size, + prepare_finalize, ata = create_pplx_prepare_finalize( + num_tokens, + hidden_dim, + topk, + num_experts, rank, dp_size, + world_size, + a.dtype, + quant_dtype, + block_shape, + per_act_token_quant, + group_name, ) + assert a.shape[0] == topk_ids.shape[0] + a_chunk = chunk_by_rank(a, rank, world_size).to(device) chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + assert a_chunk.shape[0] == chunk_topk_ids.shape[0] + + out = torch.full( + a_chunk.shape, + torch.nan, + dtype=a.dtype, + device=device, + ) + + if (quant_dtype is not None and not per_act_token_quant + and block_shape is None): + a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) + a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) + else: + a1_scale = None + a2_scale = None + b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare( a_chunk, - None, - None, + a1_scale, + a2_scale, chunk_topk_weight, chunk_topk_ids, num_experts, None, False, - FusedMoEQuantConfig(), + FusedMoEQuantConfig( + quant_dtype, + per_act_token_quant, + False, + block_shape, + ), ) - b_a = b_a * 1.5 - - out = torch.full( - (max_num_tokens, hidden_dim), - torch.nan, - dtype=a.dtype, - device=device, - ) + b_a = dummy_work( + dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype)) prepare_finalize.finalize( out, @@ -291,70 +389,96 @@ def _pplx_prepare_finalize( score: torch.Tensor, topk: torch.Tensor, num_experts: int, + quant_dtype: Optional[torch.dtype], + block_shape: Optional[list[int]], + per_act_token_quant: bool, use_internode: bool, ): - if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) - group_name = None - else: - group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") - group_name = cpu_group.group_name - - device = pgi.device + try: + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + group_name = None + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, + backend="gloo") + group_name = cpu_group.group_name - topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) - k = a.shape[1] - - a_rep = torch.repeat_interleave(a, topk, dim=0).to(device) + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + m, k = a.shape - torch_output = (a_rep.view(-1, topk, k) * 1.5 * - topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to( - a.dtype) + a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0) - pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids, - num_experts, group_name) + torch_output = (a_rep.view(m, topk, k) * + topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum( + dim=1) - torch_output = chunk_by_rank(torch_output, pgi.rank, - pgi.world_size).to(pplx_output.device) + pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, + topk_ids, num_experts, quant_dtype, + block_shape, per_act_token_quant, + group_name) - torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) + torch_output = chunk_by_rank(torch_output, pgi.rank, + 
pgi.world_size).to(pgi.device) - if use_internode: - nvshmem_finalize() + torch.testing.assert_close(pplx_output, + torch_output, + atol=3e-2, + rtol=3e-2) + finally: + if use_internode: + nvshmem_finalize() -# TODO (bnell): this test point does not work for odd M due to how the test is -# written, not due to limitations of the pplx kernels. The pplx_moe -# test below is able to deal with odd M. -# TODO (bnell) add fp8 tests -@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS) +@pytest.mark.parametrize("mnk", PPLX_COMBOS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@pytest.mark.parametrize("per_act_token_quant", [False, True]) +@pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("use_internode", [False]) +@pytest.mark.optional @requires_pplx -def test_pplx_prepare_finalize( +def test_pplx_prepare_finalize_slow( mnk: tuple[int, int, int], e: int, topk: int, dtype: torch.dtype, world_dp_size: tuple[int, int], + per_act_token_quant: bool, + block_shape: Optional[list[int]], use_internode: bool, ): + if dtype == torch.float8_e4m3fn: + use_fp8_w8a8 = True + act_dtype = torch.bfloat16 + quant_dtype = dtype + else: + use_fp8_w8a8 = False + act_dtype = dtype + quant_dtype = None + + if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): + pytest.skip("Skip quantization test for non-quantized type") + + if per_act_token_quant and block_shape is not None: + pytest.skip("Skip illegal quantization combination") + current_platform.seed_everything(7) m, n, k = mnk world_size, dp_size = world_dp_size device = "cuda" - a = torch.randn((m, k), device=device, dtype=dtype) / 10 - score = torch.randn((m, e), device=device, dtype=dtype) + + a = torch.randn((m, k), device=device, dtype=act_dtype) / 10 + score = torch.randn((m, e), device=device, dtype=act_dtype) parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, - topk, e, use_internode) + topk, e, quant_dtype, block_shape, per_act_token_quant, + use_internode) def pplx_moe( @@ -369,84 +493,62 @@ def pplx_moe( topk_ids: torch.Tensor, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, - qtype: Optional[torch.dtype] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + quant_dtype: Optional[torch.dtype] = None, per_act_token_quant=False, block_shape: Optional[list[int]] = None, use_compile: bool = False, use_cudagraphs: bool = True, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes) - device = torch.device("cuda", rank) - hidden_dim = a.shape[1] + num_tokens, hidden_dim = a.shape num_experts = w1.shape[0] topk = topk_ids.shape[1] - max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64) + max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16) - hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes( - max_num_tokens, + prepare_finalize, ata = create_pplx_prepare_finalize( + num_tokens, hidden_dim, + topk, + num_experts, + rank, + dp_size, + world_size, a.dtype, - qtype, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, + quant_dtype, + block_shape, + per_act_token_quant, + group_name, ) - args = dict( - max_num_tokens=max_num_tokens, - num_experts=num_experts, - experts_per_token=topk, - 
rank=rank, - world_size=world_size, - dp_size=dp_size, - hidden_dim=hidden_dim, - hidden_dim_bytes=hidden_dim_bytes, - hidden_dim_scale_bytes=scale_bytes, - ) - - if group_name is None: - ata = AllToAll.internode(**args) - else: - args["group_name"] = group_name - ata = AllToAll.intranode(**args) - topk_ids = topk_ids.to(dtype=torch.uint32) - prepare_finalize = PplxPrepareAndFinalize( - ata, - max_num_tokens, - world_size, - rank, - dp_size, + experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=prepare_finalize.num_dispatchers(), + use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, + block_shape=block_shape, + per_act_token_quant=per_act_token_quant, ) - experts = BatchedTritonExperts(max_num_tokens=max_num_tokens, - world_size=world_size, - dp_size=dp_size, - use_fp8_w8a8=qtype == torch.float8_e4m3fn, - block_shape=block_shape) - fused_experts = FusedMoEModularKernel( prepare_finalize, experts, ) # Note: workers with the same dp_rank must use the exact same inputs. - a_chunk = chunk_by_rank(a, rank, world_size).to(device) - chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) - chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + a_chunk = chunk_by_rank(a, rank, world_size) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size) # Chunking weights like this only works for batched format - w1_chunk = chunk_by_rank(w1, rank, world_size).to(device) - w2_chunk = chunk_by_rank(w2, rank, world_size).to(device) - - if w1_scale is not None: - w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device) - w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device) - else: - w1_scale_chunk = None - w2_scale_chunk = None + w1_chunk = chunk_by_rank(w1, rank, world_size) + w2_chunk = chunk_by_rank(w2, rank, world_size) + w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size) + w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size) + a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size) + a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size) # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. I'm leaving the flag and @@ -468,6 +570,8 @@ def pplx_moe( chunk_topk_ids, w1_scale=w1_scale_chunk, w2_scale=w2_scale_chunk, + a1_scale=a1_scale_chunk, + a2_scale=a2_scale_chunk, global_num_experts=num_experts) if use_cudagraphs: @@ -482,6 +586,8 @@ def pplx_moe( chunk_topk_ids, w1_scale=w1_scale_chunk, w2_scale=w2_scale_chunk, + a1_scale=a1_scale_chunk, + a2_scale=a2_scale_chunk, global_num_experts=num_experts) torch.cuda.synchronize() @@ -494,48 +600,6 @@ def pplx_moe( return out -def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids): - assert torch.cuda.current_device() == pgi.local_rank - - num_experts = w1.shape[0] - device = pgi.device - rank = pgi.rank - world_size = pgi.world_size - max_num_tokens = rank_chunk(a.shape[0], 0, world_size) - - prepare_finalize = BatchedPrepareAndFinalize( - max_num_tokens=max_num_tokens, - world_size=world_size, - dp_size=dp_size, - rank=rank, - ) - - experts = NaiveBatchedExperts(max_num_tokens=a.shape[0], - world_size=1, - dp_size=1) - - fused_experts = FusedMoEModularKernel( - prepare_finalize, - experts, - ) - - # Note: workers with the same dp_rank must use the exact same inputs. 
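For reference, the rank_chunk/chunk_by_rank helpers used above partition tokens across ranks, with early ranks absorbing any remainder. A minimal standalone sketch (values are illustrative):

import torch

def rank_chunk(num: int, r: int, w: int) -> int:
    # Number of rows assigned to rank r out of w ranks.
    rem = num % w
    return (num // w) + (1 if r < rem else 0)

def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
    chunk = rank_chunk(t.shape[0], r, w)
    return t[(r * chunk):(r + 1) * chunk]

a = torch.arange(8).view(8, 1)           # 8 tokens, world_size = 2
print(chunk_by_rank(a, 0, 2).flatten())  # tensor([0, 1, 2, 3])
print(chunk_by_rank(a, 1, 2).flatten())  # tensor([4, 5, 6, 7])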
- a_chunk = chunk_by_rank(a, rank, world_size).to(device) - chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) - chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) - - out = fused_experts( - a_chunk, - # Chunking weights like this only works for batched format - chunk_by_rank(w1, rank, world_size).to(device), - chunk_by_rank(w2, rank, world_size).to(device), - chunk_topk_weight, - chunk_topk_ids, - global_num_experts=num_experts) - - return out - - def _pplx_moe( pgi: ProcessGroupInfo, dp_size: int, @@ -544,75 +608,130 @@ def _pplx_moe( w2: torch.Tensor, score: torch.Tensor, topk: int, + num_experts: int, w1_s: Optional[torch.Tensor] = None, w2_s: Optional[torch.Tensor] = None, - qtype: Optional[torch.dtype] = None, + quant_dtype: Optional[torch.dtype] = None, per_act_token_quant: bool = False, block_shape: Optional[list[int]] = None, use_internode: bool = False, ): - if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) - group_name = None - else: - group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") - group_name = cpu_group.group_name - - m, k = a.shape - e, _, n = w2.shape - - moe_config = get_default_config(m, e, n, k, topk, a.dtype, False) - - device = torch.device("cuda", pgi.rank) - a = a.to(device) - w1 = w1.to(device) - w2 = w2.to(device) - w1_s = w1_s.to(device) if w1_s is not None else None - w2_s = w2_s.to(device) if w2_s is not None else None - - with set_current_vllm_config(vllm_config), override_config(moe_config): - topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) - torch_output = torch_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_s, - w2_scale=w2_s, - quant_dtype=qtype, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape) - pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size, - a, w1, w2, topk_weight, topk_ids, w1_s, w2_s, - qtype, per_act_token_quant, block_shape) - # TODO (bnell): fix + re-enable - #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, - # topk_ids) - - torch_output = chunk_by_rank(torch_output, pgi.rank, - pgi.world_size).to(pplx_output.device) - - torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) - #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0) - - if use_internode: - nvshmem_finalize() - - -@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS) + try: + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + group_name = None + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, + backend="gloo") + group_name = cpu_group.group_name + + m, k = a.shape + e, _, n = w2.shape + + moe_config = get_default_config(m, e, n, k, topk, a.dtype, False) + + device = torch.device("cuda", pgi.rank) + rank = pgi.rank + world_size = pgi.world_size + + a = a.to(device) + w1 = w1.to(device) + w2 = w2.to(device) + w1_s = w1_s.to(device) if w1_s is not None else None + w2_s = w2_s.to(device) if w2_s is not None else None + + if (quant_dtype is not None and not per_act_token_quant + and block_shape is None): + a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) + a2_scale = torch.tensor(1.0, device="cuda", 
dtype=torch.float32) + else: + a1_scale = None + a2_scale = None + + with set_current_vllm_config(vllm_config), override_config(moe_config): + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + + torch_output = torch_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_s, + w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + ) + + batched_output = naive_batched_moe( + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_s, + w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + ) + + pplx_output = pplx_moe( + group_name, + rank, + world_size, + dp_size, + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_s, + w2_scale=w2_s, + a1_scale=a1_scale, + a2_scale=a2_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + ) + + chunked_batch_output = chunk_by_rank( + batched_output, pgi.rank, pgi.world_size).to(pplx_output.device) + + torch.testing.assert_close(batched_output, + torch_output, + atol=3e-2, + rtol=3e-2) + + torch.testing.assert_close(pplx_output, + chunked_batch_output, + atol=3e-2, + rtol=3e-2) + finally: + if use_internode: + nvshmem_finalize() + + +@pytest.mark.parametrize("mnk", PPLX_COMBOS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("use_internode", [False]) +@pytest.mark.optional @requires_pplx -def test_pplx_moe( +def test_pplx_moe_slow( mnk: tuple[int, int, int], e: int, topk: int, @@ -633,18 +752,143 @@ def test_pplx_moe( use_fp8_w8a8 = False quant_dtype = None - if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None: + if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): pytest.skip("Skip quantization test for non-quantized type") + if per_act_token_quant and block_shape is not None: + pytest.skip("Skip illegal quantization combination") + a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) - _, w1, w1_s, _, w2, w2_s = make_test_weights(e, - n, - k, - quant_dtype=quant_dtype, - block_shape=block_shape) + _, w1, w1_s, _, w2, w2_s = make_test_weights( + e, + n, + k, + quant_dtype=quant_dtype, + block_shape=block_shape, + per_act_token_quant=per_act_token_quant, + ) - parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, + parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e, w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape, use_internode) + + +def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, + make_weights: bool, test_fn: Callable): + + def format_result(msg, ex=None): + if ex is not None: + x = str(ex) + newx = x.strip(" \n\t")[:16] + if len(newx) < len(x): + newx = newx + " ..." 
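For reference, a small sketch of the activation-scale convention used in _pplx_moe above: a unit per-tensor scale is only passed when quantizing without per-token or block-wise scaling, otherwise no explicit activation scales are given. The helper name is invented for illustration.

import torch
from typing import Optional

def make_activation_scales(quant_dtype: Optional[torch.dtype],
                           per_act_token_quant: bool,
                           block_shape: Optional[list[int]]):
    # Per-tensor activation scales only apply to plain per-tensor quantization.
    if (quant_dtype is not None and not per_act_token_quant
            and block_shape is None):
        a1_scale = torch.tensor(1.0, dtype=torch.float32)
        a2_scale = torch.tensor(1.0, dtype=torch.float32)
        return a1_scale, a2_scale
    return None, None

print(make_activation_scales(torch.float8_e4m3fn, False, None))
print(make_activation_scales(None, False, None))  # (None, None)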
+ + prefix = "E\t" + print(f"{textwrap.indent(traceback.format_exc(), prefix)}") + print(f"FAILED {msg} - {newx}\n") + else: + print(f"PASSED {msg}") + + current_platform.seed_everything(7) + combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, + [False, True], [None, [128, 128]]) + exceptions = [] + count = 0 + for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos: + count = count + 1 + m, n, k = mnk + + if dtype == torch.float8_e4m3fn: + use_fp8_w8a8 = True + quant_dtype = dtype + else: + use_fp8_w8a8 = False + quant_dtype = None + + test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, " + f"dtype={dtype}, per_act_token={per_act_token_quant}, " + f"block_shape={block_shape}") + + if not use_fp8_w8a8 and (per_act_token_quant + or block_shape is not None): + print( + f"{test_desc} - Skip quantization test for non-quantized type." + ) + continue + + if per_act_token_quant and block_shape is not None: + print(f"{test_desc} - Skip illegal quantization combination.") + continue + + a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 + score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) + + args = dict() + if make_weights: + _, w1, w1_s, _, w2, w2_s = make_test_weights( + e, + n, + k, + quant_dtype=quant_dtype, + block_shape=block_shape, + per_act_token_quant=per_act_token_quant, + ) + args["w1"] = w1 + args["w2"] = w2 + args["w1_s"] = w1_s + args["w2_s"] = w2_s + + try: + test_fn( + pgi=pgi, + dp_size=dp_size, + a=a, + score=score, + topk=topk, + num_experts=e, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + use_internode=use_internode, + **args, + ) + format_result(test_desc) + except Exception as ex: + format_result(test_desc, ex) + exceptions.append(ex) + + if len(exceptions) > 0: + raise RuntimeError( + f"{len(exceptions)} of {count} tests failed in child process, " + f"rank={pgi.rank}.") + else: + print(f"{count} of {count} tests passed in child process, " + f"rank={pgi.rank}.") + + +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@pytest.mark.parametrize("use_internode", [False]) +@requires_pplx +def test_pplx_prepare_finalize( + world_dp_size: tuple[int, int], + use_internode: bool, +): + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size, + use_internode, False, _pplx_prepare_finalize) + + +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@pytest.mark.parametrize("use_internode", [False]) +@requires_pplx +def test_pplx_moe( + world_dp_size: tuple[int, int], + use_internode: bool, +): + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True, + _pplx_moe) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 5b1048797447..df89ad7e6da6 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -63,13 +63,12 @@ def batched_moe( fused_experts = FusedMoEModularKernel( BatchedPrepareAndFinalize(max_num_tokens, - world_size=1, - dp_size=1, + num_dispatchers=1, + num_local_experts=w1.shape[0], rank=0), BatchedTritonExperts( max_num_tokens=max_num_tokens, - world_size=1, - dp_size=1, + num_dispatchers=1, use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, per_act_token_quant=per_act_token_quant, block_shape=block_shape, @@ -105,13 +104,12 @@ def naive_batched_moe( fused_experts = FusedMoEModularKernel( BatchedPrepareAndFinalize(max_num_tokens, - world_size=1, 
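For reference, the shape of the sweep used by _pplx_test_loop above: iterate every parameter combination inside a single spawned worker, record failures, and raise once at the end so one bad combination does not hide the rest. A minimal sketch with made-up parameters and a trivial test function.

import itertools

def run_sweep(test_fn, sizes, dtypes):
    exceptions = []
    count = 0
    for size, dtype in itertools.product(sizes, dtypes):
        count += 1
        desc = f"case[size={size}, dtype={dtype}]"
        try:
            test_fn(size, dtype)
            print(f"PASSED {desc}")
        except Exception as ex:  # collect the failure and keep going
            print(f"FAILED {desc} - {ex}")
            exceptions.append(ex)
    if exceptions:
        raise RuntimeError(f"{len(exceptions)} of {count} cases failed.")
    print(f"{count} of {count} cases passed.")

run_sweep(lambda size, dtype: None, sizes=[2, 4], dtypes=["bf16"])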
- dp_size=1, + num_dispatchers=1, + num_local_experts=w1.shape[0], rank=0), NaiveBatchedExperts( max_num_tokens=max_num_tokens, - dp_size=1, - world_size=1, + num_dispatchers=1, use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, per_act_token_quant=per_act_token_quant, block_shape=block_shape, diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index d0dc85f25755..6f43d1111c98 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -277,6 +277,24 @@ def dequant( return t.to(out_dtype) +def batched_dequant( + t: torch.Tensor, + scale: Optional[torch.Tensor], + block_shape: Optional[list[int]], + per_act_token_quant: bool, + out_dtype: Optional[torch.dtype] = torch.float32, +) -> torch.Tensor: + if scale is not None: + assert t.shape[0] == scale.shape[0] + out = torch.empty_like(t, dtype=out_dtype) + for e in range(t.shape[0]): + out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant, + out_dtype) + return out + + return t.to(out_dtype) + + def native_batched_masked_quant_matmul( A: torch.Tensor, B: torch.Tensor, diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py index 9115949a1651..90380b872d6c 100644 --- a/tests/kernels/test_apply_repetition_penalties.py +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch @@ -74,3 +75,51 @@ def test_apply_repetition_penalties( # Test the operator by applying the opcheck utility opcheck(torch.ops._C.apply_repetition_penalties_, (logits.clone(), prompt_mask, output_mask, repetition_penalties)) + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test for checking CUDA kernel") +@torch.inference_mode() +def test_apply_repetition_penalties_zero_seqs() -> None: + """ + Test the apply_repetition_penalties custom op with num_seqs=0 + against a reference implementation. 
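For reference, the torch_experts change above now performs the final topk-weighted reduction in float32 and casts once at the end, rather than accumulating in the output dtype. A small illustration of the difference this makes for low-precision dtypes (shapes and values are arbitrary):

import torch

out = torch.randn(64, 2, 128, dtype=torch.bfloat16)
w = torch.rand(64, 2, 1, dtype=torch.float32)

# Old behaviour: downcast the weights and accumulate in bfloat16.
low = (out * w.to(out.dtype)).sum(dim=1)
# New behaviour: upcast to float32, accumulate, then cast once.
high = (out.to(torch.float32) * w).sum(dim=1).to(out.dtype)

# The two results differ by bfloat16 rounding error.
print((low.to(torch.float32) - high.to(torch.float32)).abs().max())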
+ """ + num_seqs = 0 + vocab_size = 17 + repetition_penalty = 1.05 + dtype = torch.float32 + seed = 0 + + current_platform.seed_everything(seed) + torch.set_default_device("cuda:0") + + # Create test data + logits = torch.randn(num_seqs, vocab_size, dtype=dtype) + + # Create masks with some random tokens marked as repeated + prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + + # No tokens to mark as repeated since num_seqs=0 + + # Create repetition penalties tensor + repetition_penalties = torch.full((num_seqs, ), + repetition_penalty, + dtype=dtype) + + # Run all three implementations + logits_torch = logits.clone() + logits_cuda = logits.clone() + + apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask, + repetition_penalties) + apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask, + repetition_penalties) + + # Compare all outputs to reference + torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3) + + # Test the operator by applying the opcheck utility + opcheck(torch.ops._C.apply_repetition_penalties_, + (logits.clone(), prompt_mask, output_mask, repetition_penalties)) diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index 74d29e79d96c..e25556c89fb9 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Integration tests for FlexAttention backend vs default backend""" import random diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 84cf87d71d88..fcaa93762856 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1094,6 +1094,8 @@ def torch_experts( if expert_map is not None: topk_ids = expert_map[topk_ids] + f32 = torch.float32 + for i in range(num_experts): mask = topk_ids == i if mask.sum(): @@ -1109,7 +1111,8 @@ def torch_experts( out.dtype) tmp2 = SiluAndMul()(tmp1) tmp2, b_scale = moe_kernel_quantize_input( - tmp2, None, quant_dtype, per_act_token_quant, block_shape) + tmp2, a2_scale, quant_dtype, per_act_token_quant, + block_shape) out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale, w2_scale[i], block_shape, @@ -1117,7 +1120,6 @@ def torch_experts( else: assert (a_scale is not None and w1_scale is not None and w2_scale is not None) - f32 = torch.float32 scales = a_scale if a_scale.numel() == 1 else a_scale[mask] tmp1 = a[mask].to(f32) * scales w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1) @@ -1126,8 +1128,8 @@ def torch_experts( w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1) out[mask] = (tmp2 @ w2_dq).to(out.dtype) - return (out.view(M, -1, w2.shape[1]) * - topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1) + return (out.view(M, -1, w2.shape[1]).to(f32) * + topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype) def torch_moe(a: torch.Tensor, diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 4908f9a060f7..881d5efa6919 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): model_runner.model) -@pytest.fixture(params=[True, False]) -def run_with_both_engines_lora(request, monkeypatch): - # Automatically runs tests twice, once with V1 and once without - use_v1 = request.param - # Tests decorated with `@skip_v1` are only run without v1 - skip_v1 = 
request.node.get_closest_marker("skip_v1") - - if use_v1: - if skip_v1: - pytest.skip("Skipping test on vllm V1") - monkeypatch.setenv('VLLM_USE_V1', '1') - else: - monkeypatch.setenv('VLLM_USE_V1', '0') - - yield - - @pytest.fixture def reset_default_device(): """ diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 8aba68829b10..7d7a62eec118 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -39,7 +39,7 @@ [ pytest.param( "bigscience/bloom-560m", # bloom - testing alibi slopes - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( "openai-community/gpt2", # gpt2 @@ -87,11 +87,7 @@ pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( "TitanML/tiny-mixtral", # mixtral - marks=[pytest.mark.core_model], - ), - pytest.param( - "Qwen/Qwen1.5-MoE-A2.7B-Chat", - marks=[pytest.mark.cpu_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e6dd6c35e64d..ecaae3ec1fc4 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -3,6 +3,7 @@ import pytest +from tests.models.registry import HF_EXAMPLE_MODELS from tests.utils import multi_gpu_test from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import SamplingParams @@ -19,31 +20,55 @@ SSM_MODELS = [ "state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev", - # TODO: Compare to a Mamba2 model. The HF transformers implementation of - # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test - # doesn't compare vLLM output with HF output. - # See https://github.com/huggingface/transformers/pull/35943 "mistralai/Mamba-Codestral-7B-v0.1", ] HYBRID_MODELS = [ "ai21labs/Jamba-tiny-dev", - # NOTE: Currently the test failes due to HF transformers issue fixed in: - # https://github.com/huggingface/transformers/pull/39033 - # We will enable vLLM test for Granite after next HF transformers release. - # "ibm-granite/granite-4.0-tiny-preview", # NOTE: Running Plamo2 in transformers implementation requires to install # causal-conv1d package, which is not listed as a test dependency as it's # not compatible with pip-compile. "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", + "ibm-ai-platform/Bamba-9B-v1", + "nvidia/Nemotron-H-8B-Base-8K", + "ibm-granite/granite-4.0-tiny-preview", + "tiiuae/Falcon-H1-0.5B-Base", +] + +HF_UNSUPPORTED_MODELS = [ + # The HF transformers implementation of + # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test + # doesn't compare vLLM output with HF output. + # See https://github.com/huggingface/transformers/pull/35943 + "mistralai/Mamba-Codestral-7B-v0.1", + # Note: I'm not seeing the same output from vLLM V0 vs. HF transformers + # for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1 + "nvidia/Nemotron-H-8B-Base-8K", + # NOTE: Currently the test fails due to HF transformers issue fixed in: + # https://github.com/huggingface/transformers/pull/39033 + # We will enable vLLM test for Granite after next HF transformers release. 
+ "ibm-granite/granite-4.0-tiny-preview", ] V1_SUPPORTED_MODELS = [ "mistralai/Mamba-Codestral-7B-v0.1", + "ibm-ai-platform/Bamba-9B-v1", + "Zyphra/Zamba2-1.2B-instruct", + "nvidia/Nemotron-H-8B-Base-8K", + "ibm-granite/granite-4.0-tiny-preview", + "tiiuae/Falcon-H1-0.5B-Base", ] +ATTN_BLOCK_SIZES = { + "ibm-ai-platform/Bamba-9B-v1": 528, + "Zyphra/Zamba2-1.2B-instruct": 80, + "nvidia/Nemotron-H-8B-Base-8K": 528, + "ibm-granite/granite-4.0-tiny-preview": 400, + "tiiuae/Falcon-H1-0.5B-Base": 800, +} + # Avoid OOM MAX_NUM_SEQS = 4 @@ -60,8 +85,16 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: + + try: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + except ValueError: + pass + with hf_runner(model) as hf_model: - if model != "mistralai/Mamba-Codestral-7B-v0.1": + if model not in HF_UNSUPPORTED_MODELS: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) else: @@ -72,12 +105,21 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: + if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES: + block_size = ATTN_BLOCK_SIZES[model] + else: + block_size = 16 + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + if model in HYBRID_MODELS: + # required due to reorder_batch behaviour + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, enforce_eager=True, - enable_prefix_caching=False) as vllm_model: + enable_prefix_caching=False, + block_size=block_size) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) else: @@ -111,6 +153,14 @@ def test_batching( max_tokens: int, num_logprobs: int, ) -> None: + + try: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + except ValueError: + pass + for_loop_outputs = [] with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: for prompt in example_prompts: diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 4b5c77d16fe2..05fcf4101ff9 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from typing import Optional import pytest @@ -27,10 +29,8 @@ def v1(run_with_both_engines): # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param( - "intfloat/e5-mistral-7b-instruct", - # CPU v1 doesn't support sliding window - marks=[pytest.mark.core_model]), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), # the qwen models interfere with each other (see PR # https://github.com/vllm-project/vllm/pull/18720). 
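For reference, the hybrid-model test above selects a per-model attention block size with a default of 16; an equivalent, slightly more compact lookup (the dictionary entries are taken from the test, the second model name is a placeholder):

ATTN_BLOCK_SIZES = {
    "ibm-ai-platform/Bamba-9B-v1": 528,
    "Zyphra/Zamba2-1.2B-instruct": 80,
}
HYBRID_MODELS = ["ibm-ai-platform/Bamba-9B-v1", "some/other-model"]

def pick_block_size(model: str) -> int:
    # Hybrid models with a registered size use it; everything else uses 16.
    return ATTN_BLOCK_SIZES.get(model, 16) if model in HYBRID_MODELS else 16

print(pick_block_size("ibm-ai-platform/Bamba-9B-v1"))  # 528
print(pick_block_size("some/other-model"))             # 16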
# To avoid this problem, for now we skip v0 since it will be @@ -38,13 +38,11 @@ def v1(run_with_both_engines): pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), # [Encoder-only] - pytest.param( - "BAAI/bge-base-en-v1.5", - marks=[ - # CPU only supports V1 - pytest.mark.core_model, - pytest.mark.skip_v1 - ]), + pytest.param("BAAI/bge-base-en-v1.5", + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.skip_v1 + ]), pytest.param("sentence-transformers/all-MiniLM-L12-v2", marks=[pytest.mark.skip_v1]), pytest.param("intfloat/multilingual-e5-small", @@ -63,6 +61,10 @@ def test_models( model, monkeypatch, ) -> None: + if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu( + ) and os.environ.get("VLLM_USE_V1", "0") == "1": + pytest.skip("CPU V1 doesn't support sliding window") + if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend @@ -73,6 +75,13 @@ def test_models( vllm_extra_kwargs["override_pooler_config"] = \ PoolerConfig(pooling_type="MEAN", normalize=False) + max_model_len: Optional[int] = 512 + if model in [ + "sentence-transformers/all-MiniLM-L12-v2", + "sentence-transformers/stsb-roberta-base-v2" + ]: + max_model_len = None + # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -86,7 +95,7 @@ def test_models( with vllm_runner(model, task="embed", - max_model_len=512, + max_model_len=max_model_len, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.embed(example_prompts) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 6a3a0f150b6d..0ad54785308e 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -56,10 +56,16 @@ enable_test=False), ] +V1FlashAttentionImpNotSupported = [ + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base" +] + @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, + monkeypatch) -> None: + if model_info.name in V1FlashAttentionImpNotSupported: + monkeypatch.setenv("VLLM_USE_V1", "0") vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": @@ -71,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner, @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - example_prompts) -> None: + model_info: EmbedModelInfo, example_prompts, + monkeypatch) -> None: + if model_info.name in V1FlashAttentionImpNotSupported: + monkeypatch.setenv("VLLM_USE_V1", "0") vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index b6e83857fa70..d899aaada262 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from ...utils import EmbedModelInfo diff --git a/tests/models/language/pooling/test_mxbai_rerank.py 
b/tests/models/language/pooling/test_mxbai_rerank.py new file mode 100644 index 000000000000..a1293a95bfd5 --- /dev/null +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import pytest +import torch + +from tests.conftest import HfRunner + +from .mteb_utils import RerankModelInfo, mteb_test_rerank_models + +RERANK_MODELS = [ + RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", + architecture="Qwen2ForSequenceClassification", + dtype="float32", + enable_test=True), + RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", + architecture="Qwen2ForSequenceClassification", + dtype="float32", + enable_test=False) +] + + +class MxbaiRerankerHfRunner(HfRunner): + + def __init__(self, + model_name: str, + dtype: str = "auto", + *args: Any, + **kwargs: Any) -> None: + from transformers import AutoModelForCausalLM, AutoTokenizer + super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) + + self.tokenizer = AutoTokenizer.from_pretrained(model_name, + padding_side='left') + self.yes_loc = self.tokenizer.convert_tokens_to_ids("1") + self.no_loc = self.tokenizer.convert_tokens_to_ids("0") + + def predict(self, prompts: list[list[str]], *args, + **kwargs) -> torch.Tensor: + + def process_inputs(pairs): + inputs = self.tokenizer(pairs, + padding=False, + truncation='longest_first', + return_attention_mask=False) + for i, ele in enumerate(inputs['input_ids']): + inputs['input_ids'][i] = ele + inputs = self.tokenizer.pad(inputs, + padding=True, + return_tensors="pt") + for key in inputs: + inputs[key] = inputs[key].to(self.model.device) + return inputs + + @torch.no_grad() + def compute_logits(inputs): + logits = self.model(**inputs).logits[:, -1, :] + yes_logits = logits[:, self.yes_loc] + no_logits = logits[:, self.no_loc] + logits = yes_logits - no_logits + scores = logits.float().sigmoid() + return scores + + scores = [] + for prompt in prompts: + inputs = process_inputs([prompt]) + score = compute_logits(inputs) + scores.append(score[0].item()) + return torch.Tensor(scores) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + vllm_extra_kwargs: dict[str, Any] = {} + if model_info.architecture == "Qwen2ForSequenceClassification": + vllm_extra_kwargs["hf_overrides"] = { + "architectures": ["Qwen2ForSequenceClassification"], + "classifier_from_token": ["0", "1"], + "method": "from_2_way_softmax", + } + + mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info, + vllm_extra_kwargs) diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index 3b7fab3ba5c9..ec3d25ee22a9 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os - import pytest import torch import torch.nn.functional as F @@ -86,9 +84,6 @@ def test_prm_models( dtype: str, monkeypatch, ) -> None: - if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": - pytest.skip("CPU only supports V1") - if current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend diff --git a/tests/models/registry.py b/tests/models/registry.py index 704aa76b84d4..728c18643a00 
100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -169,7 +169,7 @@ def check_available_online( "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), - "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", min_transformers_version="4.53"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index db7e50eff72b..3646ad6c481b 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -45,8 +45,7 @@ def use_v0_only(monkeypatch): """ This module relies on V0 internals, so set VLLM_USE_V1=0. """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv('VLLM_USE_V1', '0') @pytest.mark.parametrize( diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py index 04c1f98a709e..133b2d9e4df6 100644 --- a/tests/quantization/test_rtn.py +++ b/tests/quantization/test_rtn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright © 2025, Oracle and/or its affiliates. """Tests RTN quantization startup and generation, doesn't test correctness diff --git a/tests/test_utils.py b/tests/test_utils.py index 36db8202ba62..a165d2d7213a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,10 +20,11 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, common_broadcastable_dtype, - deprecate_kwargs, get_open_port, is_lossless_cast, - make_zmq_path, make_zmq_socket, memory_profiling, - merge_async_iterators, sha256, split_zmq_path, - supports_kw, swap_dict_values) + deprecate_kwargs, get_open_port, get_tcp_uri, + is_lossless_cast, join_host_port, make_zmq_path, + make_zmq_socket, memory_profiling, + merge_async_iterators, sha256, split_host_port, + split_zmq_path, supports_kw, swap_dict_values) from .utils import create_new_process_for_each_test, error_on_warning @@ -876,3 +877,44 @@ def test_make_zmq_socket_ipv6(): def test_make_zmq_path(): assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555" assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555" + + +def test_get_tcp_uri(): + assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555" + assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555" + + +def test_split_host_port(): + # valid ipv4 + assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555) + # invalid ipv4 + with pytest.raises(ValueError): + # multi colon + assert split_host_port("127.0.0.1::5555") + with pytest.raises(ValueError): + # tailing colon + assert split_host_port("127.0.0.1:5555:") + with pytest.raises(ValueError): + # no colon + assert split_host_port("127.0.0.15555") + with pytest.raises(ValueError): + # none int port + assert split_host_port("127.0.0.1:5555a") + + # valid ipv6 + assert split_host_port("[::1]:5555") == ("::1", 5555) + # invalid ipv6 + with pytest.raises(ValueError): + # multi colon + assert split_host_port("[::1]::5555") + with pytest.raises(IndexError): + # no colon + assert 
split_host_port("[::1]5555") + with pytest.raises(ValueError): + # none int port + assert split_host_port("[::1]:5555a") + + +def test_join_host_port(): + assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555" + assert join_host_port("::1", 5555) == "[::1]:5555" diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py index 0c9a574e03dc..49b8e4b96f1b 100644 --- a/tests/tool_use/test_minimax_tool_parser.py +++ b/tests/tool_use/test_minimax_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 import json diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index dd154177bc8b..8d26b9051590 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 652a556659fe..02d2c83ab158 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -9,7 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -17,6 +17,7 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager +from vllm.v1.structured_output.request import StructuredOutputRequest EOS_TOKEN_ID = 50256 @@ -33,6 +34,7 @@ def create_scheduler( block_size: int = 16, max_model_len: Optional[int] = None, num_speculative_tokens: Optional[int] = None, + skip_tokenizer_init: bool = False, ) -> Scheduler: '''Create scheduler under test. @@ -65,6 +67,7 @@ def create_scheduler( trust_remote_code=True, dtype="float16", seed=42, + skip_tokenizer_init=skip_tokenizer_init, ) # Cache config, optionally force APC kwargs_cache = ({} if enable_prefix_caching is None else { @@ -186,7 +189,7 @@ def test_get_num_unfinished_requests(): ]) def test_schedule(enable_prefix_caching: Optional[bool], prompt_logprobs: Optional[int]): - '''Test scheduling. + '''Test scheduling. 
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs ''' scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching) @@ -1408,7 +1411,7 @@ def create_requests_with_priority( def test_priority_scheduling_basic_ordering(): - """Test that requests are scheduled in priority order + """Test that requests are scheduled in priority order (lower value = higher priority).""" scheduler = create_scheduler_with_priority() @@ -1437,7 +1440,7 @@ def test_priority_scheduling_basic_ordering(): def test_priority_scheduling_arrival_time_tiebreaker(): - """Test that arrival time is used + """Test that arrival time is used as tiebreaker when priorities are equal.""" scheduler = create_scheduler_with_priority() @@ -1495,7 +1498,7 @@ def test_priority_scheduling_mixed_priority_and_arrival(): def test_priority_scheduling_preemption(): - """Test that priority scheduling preempts + """Test that priority scheduling preempts lower priority requests when memory is constrained.""" # Create scheduler with very limited memory to force preemption scheduler = create_scheduler_with_priority( @@ -1576,7 +1579,7 @@ def test_priority_scheduling_preemption(): def test_priority_scheduling_no_preemption_when_space_available(): - """Test that preemption doesn't happen + """Test that preemption doesn't happen when there's space for new requests.""" scheduler = create_scheduler_with_priority( max_num_seqs=3, # Allow 3 concurrent requests @@ -1626,7 +1629,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): def test_priority_scheduling_preemption_victim_selection(): - """Test that the correct victim is selected for + """Test that the correct victim is selected for preemption based on priority and arrival time.""" # This test verifies the priority-based victim selection logic # by checking the waiting queue order after adding requests with different @@ -1743,7 +1746,7 @@ def test_priority_scheduling_waiting_queue_order(): def test_priority_scheduling_fcfs_fallback(): - """Test that FCFS behavior is maintained when all + """Test that FCFS behavior is maintained when all requests have same priority.""" scheduler = create_scheduler_with_priority() @@ -1811,7 +1814,7 @@ def test_priority_scheduling_with_limited_slots(): def test_priority_scheduling_heap_property(): - """Test that the waiting queue maintains heap + """Test that the waiting queue maintains heap property for priority scheduling.""" scheduler = create_scheduler_with_priority( max_num_seqs=1, # Only one request can run at a time @@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property(): # Verify requests were scheduled in priority order (lowest value first) expected_priorities = sorted(priorities) assert scheduled_priorities == expected_priorities + + +def test_schedule_skip_tokenizer_init(): + scheduler = create_scheduler(skip_tokenizer_init=True) + requests = create_requests(num_requests=5) + for request in requests: + scheduler.add_request(request) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.grammar_bitmask is None + + +def test_schedule_skip_tokenizer_init_structured_output_request(): + scheduler = create_scheduler(skip_tokenizer_init=True) + guided_params = GuidedDecodingParams(regex="[0-9]+") + sampling_params = SamplingParams( + ignore_eos=False, + max_tokens=16, + guided_decoding=guided_params, + ) + request = Request( + request_id="0", + prompt_token_ids=[0, 1], + multi_modal_inputs=None, + multi_modal_hashes=None, + multi_modal_placeholders=None, 
+ sampling_params=sampling_params, + pooling_params=None, + eos_token_id=EOS_TOKEN_ID, + structured_output_request=StructuredOutputRequest(sampling_params), + ) + scheduler.add_request(request) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 6284dcfb915b..059106c62a20 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,19 +1,30 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations import random -from typing import Optional +from typing import TYPE_CHECKING, Optional import pytest -from vllm import LLM, SamplingParams +from vllm import LLM +from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector +if TYPE_CHECKING: + from tests.conftest import VllmRunner + MODEL = "facebook/opt-125m" DTYPE = "half" -def _vllm_model(apc: bool, vllm_runner, monkeypatch): +def _vllm_model( + apc: bool, + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, + *, + skip_tokenizer_init: bool = False, +): """Set up VllmRunner instance.""" monkeypatch.setenv("VLLM_USE_V1", "1") return vllm_runner( @@ -23,6 +34,7 @@ def _vllm_model(apc: bool, vllm_runner, monkeypatch): enforce_eager=True, enable_prefix_caching=apc, gpu_memory_utilization=0.5, + skip_tokenizer_init=skip_tokenizer_init, ) @@ -45,9 +57,27 @@ def vllm_model_apc(vllm_runner, monkeypatch): yield vllm_model +@pytest.fixture( + # Function scope decouples tests & allows + # env var adjustment via monkeypatch + scope="function", + # Prefix caching + params=[False, True]) +def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): + """VllmRunner test fixture with APC.""" + with _vllm_model( + request.param, + vllm_runner, + monkeypatch, + skip_tokenizer_init=True, + ) as vllm_model: + yield vllm_model + + def _get_test_sampling_params( prompt_list: list[str], seed: Optional[int] = 42, + structured_outputs: bool = False, ) -> tuple[list[SamplingParams], list[int]]: """Generate random sampling params for a batch.""" @@ -62,14 +92,34 @@ def get_mostly_n_gt1() -> int: n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))] # High temperature to maximize the chance of unique completions return [ - SamplingParams(temperature=0.95, top_p=0.95, n=n, seed=seed) - for n in n_list + SamplingParams( + temperature=0.95, + top_p=0.95, + n=n, + seed=seed, + guided_decoding=GuidedDecodingParams( + regex="[0-9]+") if structured_outputs else None, + ) for n in n_list ], n_list +def test_compatibility_with_skip_tokenizer_init( + vllm_model_skip_tokenizer_init: VllmRunner, + example_prompts: list[str], +): + # Case 1: Structured output request should raise an error. + sampling_params_list, _ = _get_test_sampling_params( + example_prompts, + structured_outputs=True, + ) + model: LLM = vllm_model_skip_tokenizer_init.model + with pytest.raises(ValueError): + _ = model.generate(example_prompts, sampling_params_list) + + def test_parallel_sampling(vllm_model, example_prompts) -> None: """Test passes if parallel sampling `n>1` yields `n` unique completions. - + Args: vllm_model: VllmRunner instance under test. example_prompt: test fixture providing prompts for testing. 
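The new skip_tokenizer_init tests above assert that structured-output (guided-decoding) requests are rejected when the engine runs without a tokenizer. A hedged end-to-end sketch of that behaviour; the model name and prompt are illustrative and the exact error message is not guaranteed:

from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

llm = LLM(model="facebook/opt-125m",
          skip_tokenizer_init=True,
          enforce_eager=True)
params = SamplingParams(
    max_tokens=16,
    guided_decoding=GuidedDecodingParams(regex="[0-9]+"),
)
try:
    llm.generate(["Hello, my name is"], params)
except ValueError as err:
    # Building the guided-decoding grammar requires a tokenizer, so the
    # request is expected to fail fast rather than at sampling time.
    print(f"rejected: {err}")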
diff --git a/tests/v1/sample/test_logits_processors.py b/tests/v1/sample/test_logits_processors.py index a8e230a97ed5..84ee3b0392b4 100644 --- a/tests/v1/sample/test_logits_processors.py +++ b/tests/v1/sample/test_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from collections.abc import Callable diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index d640d7dc49d1..7a7ba346a719 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -13,7 +13,6 @@ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder "state-spaces/mamba-130m-hf", # mamba1 - "hmellor/tiny-random-BambaForCausalLM", # hybrid "BAAI/bge-m3", # embedding ] diff --git a/tests/v1/test_request.py b/tests/v1/test_request.py index 2dc90f83caba..fb835747cfc6 100644 --- a/tests/v1/test_request.py +++ b/tests/v1/test_request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.v1.request import RequestStatus diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py index 916325e41b92..ad234df0c8ed 100644 --- a/tests/v1/tpu/test_spmd_model_weight_loading.py +++ b/tests/v1/tpu/test_spmd_model_weight_loading.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc import tempfile diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py index b98570f01a7f..46fa1193881f 100644 --- a/tests/v1/tpu/test_tpu_qkv_linear.py +++ b/tests/v1/tpu/test_tpu_qkv_linear.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile import numpy as np diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 2e1deecbd9e6..d13df553db62 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2): def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): + torch.set_default_dtype(torch.float16) layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} must come before the current layer" @@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): + torch.set_default_dtype(torch.float16) layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" invalid_layer = "model.layers.0.cross_attn.attn" @@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): def test_init_kv_cache_with_kv_sharing_target_same_as_current(): + torch.set_default_dtype(torch.float16) layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} cannot be the same as the current layer" @@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(): def test_init_kv_cache_without_kv_sharing(): + torch.set_default_dtype(torch.float16) layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" vllm_config = get_vllm_config() @@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing(): def 
test_init_kv_cache_with_kv_sharing_valid(): + torch.set_default_dtype(torch.float16) layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" vllm_config = get_vllm_config() diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index b212df400b25..ef197d1fbace 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py index 92914186b16e..ced10ba9097b 100644 --- a/tools/check_spdx_header.py +++ b/tools/check_spdx_header.py @@ -2,51 +2,146 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys +from enum import Enum -SPDX_HEADER = ( + +class SPDXStatus(Enum): + """SPDX header status enumeration""" + EMPTY = "empty" # empty __init__.py + COMPLETE = "complete" + MISSING_LICENSE = "missing_license" # Only has copyright line + MISSING_COPYRIGHT = "missing_copyright" # Only has license line + MISSING_BOTH = "missing_both" # Completely missing + + +FULL_SPDX_HEADER = ( "# SPDX-License-Identifier: Apache-2.0\n" "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project") -SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" +LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0" +COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project" # noqa: E501 -def check_spdx_header(file_path): - with open(file_path, encoding='UTF-8') as file: + +def check_spdx_header_status(file_path): + """Check SPDX header status of the file""" + with open(file_path, encoding="UTF-8") as file: lines = file.readlines() if not lines: - # Empty file like __init__.py - return True - for line in lines: - if line.strip().startswith(SPDX_HEADER_PREFIX): - return True - return False + # Empty file + return SPDXStatus.EMPTY + + # Skip shebang line + start_idx = 0 + if lines and lines[0].startswith("#!"): + start_idx = 1 + has_license = False + has_copyright = False -def add_header(file_path): - with open(file_path, 'r+', encoding='UTF-8') as file: + # Check all lines for SPDX headers (not just the first two) + for i in range(start_idx, len(lines)): + line = lines[i].strip() + if line == LICENSE_LINE: + has_license = True + elif line == COPYRIGHT_LINE: + has_copyright = True + + # Determine status based on what we found + if has_license and has_copyright: + return SPDXStatus.COMPLETE + elif has_license and not has_copyright: + # Only has license line + return SPDXStatus.MISSING_COPYRIGHT + # Only has copyright line + elif not has_license and has_copyright: + return SPDXStatus.MISSING_LICENSE + else: + # Completely missing both lines + return SPDXStatus.MISSING_BOTH + + +def add_header(file_path, status): + """Add or supplement SPDX header based on status""" + with open(file_path, "r+", encoding="UTF-8") as file: lines = file.readlines() file.seek(0, 0) - if lines and lines[0].startswith("#!"): - file.write(lines[0]) - file.write(SPDX_HEADER + '\n') - file.writelines(lines[1:]) - else: - file.write(SPDX_HEADER + '\n') + file.truncate() + + if status == SPDXStatus.MISSING_BOTH: + # Completely missing, add complete header + if lines and lines[0].startswith("#!"): + # Preserve shebang line + file.write(lines[0]) + file.write(FULL_SPDX_HEADER + "\n") + file.writelines(lines[1:]) + else: + # Add header directly + file.write(FULL_SPDX_HEADER + "\n") + file.writelines(lines) + 
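As a quick illustration of the classification logic in check_spdx_header_status above, the snippet below (file contents invented for the example) should print one status per file:

import os
import tempfile

samples = {
    "complete.py": ("# SPDX-License-Identifier: Apache-2.0\n"
                    "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"),
    "license_only.py": "# SPDX-License-Identifier: Apache-2.0\nprint('x')\n",
    "nothing.py": "print('x')\n",
    "empty.py": "",
}
tmpdir = tempfile.mkdtemp()
for name, text in samples.items():
    path = os.path.join(tmpdir, name)
    with open(path, "w", encoding="UTF-8") as f:
        f.write(text)
    print(name, check_spdx_header_status(path).value)
# complete.py complete
# license_only.py missing_copyright
# nothing.py missing_both
# empty.py empty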
+ elif status == SPDXStatus.MISSING_COPYRIGHT: + # Only has license line, need to add copyright line + # Find the license line and add copyright line after it + for i, line in enumerate(lines): + if line.strip() == LICENSE_LINE: + # Insert copyright line after license line + lines.insert( + i + 1, + f"{COPYRIGHT_LINE}\n", + ) + break + + file.writelines(lines) + + elif status == SPDXStatus.MISSING_LICENSE: + # Only has copyright line, need to add license line + # Find the copyright line and add license line before it + for i, line in enumerate(lines): + if line.strip() == COPYRIGHT_LINE: + # Insert license line before copyright line + lines.insert(i, f"{LICENSE_LINE}\n") + break file.writelines(lines) def main(): - files_with_missing_header = [] + """Main function""" + files_missing_both = [] + files_missing_copyright = [] + files_missing_license = [] + for file_path in sys.argv[1:]: - if not check_spdx_header(file_path): - files_with_missing_header.append(file_path) + status = check_spdx_header_status(file_path) - if files_with_missing_header: + if status == SPDXStatus.MISSING_BOTH: + files_missing_both.append(file_path) + elif status == SPDXStatus.MISSING_COPYRIGHT: + files_missing_copyright.append(file_path) + elif status == SPDXStatus.MISSING_LICENSE: + files_missing_license.append(file_path) + else: + continue + + # Collect all files that need fixing + all_files_to_fix = (files_missing_both + files_missing_copyright + + files_missing_license) + if all_files_to_fix: print("The following files are missing the SPDX header:") - for file_path in files_with_missing_header: - print(f" {file_path}") - add_header(file_path) + if files_missing_both: + for file_path in files_missing_both: + print(f" {file_path}") + add_header(file_path, SPDXStatus.MISSING_BOTH) + + if files_missing_copyright: + for file_path in files_missing_copyright: + print(f" {file_path}") + add_header(file_path, SPDXStatus.MISSING_COPYRIGHT) + if files_missing_license: + for file_path in files_missing_license: + print(f" {file_path}") + add_header(file_path, SPDXStatus.MISSING_LICENSE) - sys.exit(1 if files_with_missing_header else 0) + sys.exit(1 if all_files_to_fix else 0) if __name__ == "__main__": diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6b1b3f787c23..eb9d0b405892 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -646,6 +646,20 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability) +def cutlass_blockwise_scaled_grouped_mm( + output: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + scales_a: torch.Tensor, + scales_b: torch.Tensor, + problem_sizes: torch.Tensor, + expert_offsets: torch.Tensor, +): + torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a, + scales_b, problem_sizes, + expert_offsets) + + def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, block_scale_a: torch.Tensor, block_scale_b: torch.Tensor, alpha: torch.Tensor, diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py new file mode 100644 index 000000000000..a490aa397991 --- /dev/null +++ b/vllm/attention/backends/torch_sdpa.py @@ -0,0 +1,546 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.functional import 
scaled_dot_product_attention + +# yapf conflicts with isort for this block +# yapf: disable +from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, + AttentionMetadata, AttentionType, + is_quantized_kv_cache) +# yapf: enable +from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex +from vllm.attention.ops.paged_attn import PagedAttentionMetadata +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass +class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for TorchSDPABackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + chunked_prefill: bool + seq_lens: Optional[List[int]] = None # For non-chunked prefill + + # For chunked prefill only + max_query_len: Optional[int] = None + max_kv_len: Optional[int] = None + prefill_query_start_loc: Optional[torch.Tensor] = None + kv_start_loc: Optional[torch.Tensor] = None + prefill_block_tables: Optional[torch.Tensor] = None + + # For V1 logits index only + query_start_loc: Optional[torch.Tensor] = None + + # Begin encoder attn & enc/dec cross-attn fields... + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + self.encoder_attn_bias: Optional[List[torch.Tensor]] = None + self.cross_attn_bias: Optional[List[torch.Tensor]] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) + + @property + def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_prefill_tokens == 0: + return None + return self + + @property + def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: + if self.num_decode_tokens == 0: + return None + return self + + def get_seq_lens( + self, + attn_type: str, + ): + ''' + Extract appropriate sequence lengths from attention metadata + according to attention type. 
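+ For example, decoder self-attention and encoder-only attention use
+ self.seq_lens for both query and key/value, encoder attention uses
+ encoder_seq_lens for both, and encoder/decoder cross-attention pairs
+ self.seq_lens (query) with encoder_seq_lens (key/value).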
+ + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate sequence lengths tensor for query + * Appropriate sequence lengths tensor for key & value + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + seq_lens_q = self.seq_lens + seq_lens_kv = self.seq_lens + elif attn_type == AttentionType.ENCODER: + seq_lens_q = self.encoder_seq_lens + seq_lens_kv = self.encoder_seq_lens + elif attn_type == AttentionType.ENCODER_DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.encoder_seq_lens + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + return seq_lens_q, seq_lens_kv + + def get_attn_bias( + self, + attn_type: str, + ) -> Optional[List[torch.Tensor]]: + ''' + Extract appropriate attention bias from attention metadata + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate attention bias value given the attention type + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + return self.attn_bias + elif attn_type == AttentionType.ENCODER: + return self.encoder_attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + return self.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def set_attn_bias( + self, + attn_bias: List[torch.Tensor], + attn_type: str, + ) -> None: + ''' + Update appropriate attention bias field of attention metadata, + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_bias: The desired attention bias value + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + self.attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER: + self.encoder_attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + self.cross_attn_bias = attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def get_seq_len_block_table_args( + self, + attn_type: str, + ) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. 
+ + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + return (self.seq_lens_tensor, self.max_decode_seq_len, + self.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + self.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + use_irope: bool = False, + ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") + if blocksparse_params is not None: + raise ValueError( + "Torch SPDA does not support block-sparse attention.") + if logits_soft_cap is not None: + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") + if use_irope: + logger.warning_once( + "Using irope in Torch SPDA is not supported yet, it will fall" + " back to global attention for long context.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. 
" + f"Supported head sizes are: {supported_head_sizes}.") + + if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: + raise NotImplementedError( + "Torch SDPA backend FP8 KV cache requires " + "intel_extension_for_pytorch support.") + self.attn_type = attn_type + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: TorchSDPAMetadata, # type: ignore + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with torch SDPA and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TorchSDPABackendImpl") + + # For warming-up + if attn_metadata is None: + return query + + attn_type = self.attn_type + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): + # KV-cache during decoder-self- or + # encoder-decoder-cross-attention, but not + # during encoder attention. + # + # Even if there are no new key/value pairs to cache, + # we still need to break out key_cache and value_cache + # i.e. for later use by paged attention + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + if (key is not None) and (value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + # During cross-attention decode, key & value will be None, + # preventing this IF-statement branch from running + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) + + if attn_type != AttentionType.ENCODER: + # Decoder self-attention supports chunked prefill. 
+ # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + else: + # Encoder attention - chunked prefill is not applicable; + # derive token-count from query shape & and treat them + # as 100% prefill tokens + assert attn_metadata.num_encoder_tokens is not None + num_prefill_tokens = attn_metadata.num_encoder_tokens + num_decode_tokens = 0 + + if attn_type == AttentionType.DECODER: + # Only enforce this shape-constraint for decoder + # self-attention + assert key.shape[0] == num_prefill_tokens + num_decode_tokens + assert value.shape[0] == num_prefill_tokens + num_decode_tokens + + output = torch.empty_like(query) + if prefill_meta := attn_metadata.prefill_metadata: + if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + assert attn_metadata.seq_lens is not None + self._run_sdpa_forward(output, + query, + key, + value, + prefill_meta, + attn_type=attn_type) + else: + # prefix-enabled attention + assert not self.need_mask + import intel_extension_for_pytorch.llm.modules as ipex_modules + output = torch.empty_like(query) + ipex_modules.PagedAttention.flash_attn_varlen_func( + output[:prefill_meta.num_prefill_tokens, :, :], + query[:prefill_meta.num_prefill_tokens, :, :], + key_cache, + value_cache, + prefill_meta.prefill_query_start_loc, + prefill_meta.kv_start_loc, + prefill_meta.max_query_len, + prefill_meta.max_kv_len, + self.scale, + True, + prefill_meta.prefill_block_tables, + self.alibi_slopes, + ) + + if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") + # Decoding run. + ( + seq_lens_arg, + max_seq_len_arg, + block_tables_arg, + ) = decode_meta.get_seq_len_block_table_args(attn_type) + + PagedAttention.forward_decode( + output[attn_metadata.num_prefill_tokens:, :, :], + query[attn_metadata.num_prefill_tokens:, :, :], + key_cache, + value_cache, + block_tables_arg, + seq_lens_arg, + max_seq_len_arg, + self.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + layer._k_scale, + layer._v_scale, + ) + + # Reshape the output tensor. 
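+ # [num_tokens, num_heads, head_size] -> [num_tokens, num_heads * head_size]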
+ return output.view(-1, self.num_heads * self.head_size) + + def _run_sdpa_forward( + self, + output: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: TorchSDPAMetadata, + attn_type: str = AttentionType.DECODER, + ) -> None: + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, dim=1) + + attn_masks = attn_metadata.get_attn_bias(attn_type) + if attn_masks is None: + if self.alibi_slopes is not None: + attn_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + assert attn_metadata.seq_lens is not None + attn_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + seq_lens, _ = attn_metadata.get_seq_lens(attn_type) + attn_masks = [None] * len(seq_lens) + attn_metadata.set_attn_bias(attn_masks, attn_type) + + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + causal_attn = (attn_type == AttentionType.DECODER) + + seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) + start_q, start_kv = 0, 0 + for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, + attn_masks): + end_q = start_q + seq_len_q + end_kv = start_kv + seq_len_kv + sub_out = scaled_dot_product_attention( + query[None, :, start_q:end_q, :], + key[None, :, start_kv:end_kv, :], + value[None, :, start_kv:end_kv, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=causal_attn and mask is None, + scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) + output[start_q:end_q, :, :] = sub_out + start_q, start_kv = end_q, end_kv + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. 
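+ # For example, with seq_len = 3 the relative-position term computed
+ # below is
+ #   [[ 0,  1,  2],
+ #    [-1,  0,  1],
+ #    [-2, -1,  0]]
+ # before the per-head alibi_slopes scaling and the -inf mask above the
+ # diagonal are applied.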
+ bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases: List[torch.Tensor] = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py new file mode 100644 index 000000000000..891975498916 --- /dev/null +++ b/vllm/attention/ops/ipex_attn.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import List, Optional, Tuple + +try: + import intel_extension_for_pytorch.llm.modules as ipex_modules + _use_ipex = True +# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 +except (ImportError, AttributeError): + _use_ipex = False + +import torch + +from vllm import _custom_ops as ops + + +class _PagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 80, 96, 112, 128, 192, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[int, ...]: + return 2, num_blocks, block_size * num_kv_heads * head_size + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale, + v_scale, + ) + + @staticmethod + def forward_decode( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + tp_rank: int = 0 + blocksparse_local_blocks: int = 0 + blocksparse_vert_stride: int = 0 + blocksparse_block_size: int = 64 + blocksparse_head_sliding_step: int = 0 + block_size = value_cache.shape[3] + + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + 
block_size, + max_context_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + *args, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) + + +class _IPEXPagedAttention(_PagedAttention): + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[torch.Tensor, torch.Tensor]: + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + ipex_modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, + slot_mapping.flatten().int()) + + @staticmethod + def forward_decode( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + *args, + ) -> None: + block_size = value_cache.shape[2] + head_mapping = torch.arange( + 0, + num_kv_heads, + device="cpu", + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(query.size(1) // num_kv_heads).flatten() + ipex_modules.PagedAttention.single_query_cached_kv_attention( + output, query.contiguous(), key_cache, value_cache, head_mapping, + scale, block_tables, context_lens, block_size, max_context_len, + alibi_slopes) + + +PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index cf57e5ed282e..79518b6f4f96 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch._inductor.pattern_matcher as pm diff --git a/vllm/config.py b/vllm/config.py index 226a1014fa72..a1d8c32953b0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -466,6 +466,9 @@ def __post_init__(self) -> None: "affect the random state of the Python process that " "launched vLLM.", self.seed) + # Keep set served_model_name before maybe_model_redirect(self.model) + self.served_model_name = get_served_model_name(self.model, + self.served_model_name) self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. 
if self.tokenizer is None: @@ -609,8 +612,6 @@ def __post_init__(self) -> None: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) - self.served_model_name = get_served_model_name(self.model, - self.served_model_name) self.multimodal_config = self._init_multimodal_config() if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -1420,7 +1421,7 @@ def is_multimodal_model(self) -> bool: @property def is_cross_encoder(self) -> bool: - return self.registry.is_cross_encoder_model(self.architectures) + return self.task == "classify" @property def use_mla(self) -> bool: @@ -4762,6 +4763,12 @@ def try_verify_and_update_config(self): if cls is not None: cls.verify_and_update_config(self) + if self.model_config.task == "classify": + # Maybe convert ForCausalLM into ForSequenceClassification model. + from vllm.model_executor.models.adapters import ( + SequenceClassificationConfig) + SequenceClassificationConfig.verify_and_update_config(self) + def __str__(self): return ( f"model={self.model_config.model!r}," diff --git a/vllm/distributed/eplb/__init__.py b/vllm/distributed/eplb/__init__.py index c87b039afd73..80511024b930 100644 --- a/vllm/distributed/eplb/__init__.py +++ b/vllm/distributed/eplb/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ''' Expert parallelism load balancer (EPLB). ''' diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 2185df865c1f..6b0a126ca9b2 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Expert parallelism load balancer (EPLB) metrics and states. diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 7ad6d566b55b..879b5b9f1824 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Expert parallelism load balancer (EPLB) for vLLM. diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index cf173c734afd..2ef8587b559b 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ The actual execution of the rearrangement. diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 493235d724f4..5cbc8ca31752 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -97,10 +97,10 @@ def get_kv_connector_cache_layout(): # used for faster transfer. vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config - if vllm_config.model_config is None or kv_config is None: + if kv_config is not None and vllm_config.model_config is None: logger.warning_once("Unable to detect current VLLM config. " \ "Defaulting to NHD kv cache layout.") - else: + elif kv_config is not None: use_mla = vllm_config.model_config.use_mla if not use_mla and kv_config.kv_connector == "NixlConnector": logger.info_once("NixlConnector detected. 
Setting KV cache " \ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 2f870971ded7..52f589a6d718 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 35c26897fe3f..6c9ccb2e301e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import os diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 303619a3fdd0..02e3bc6274f6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import atexit import ctypes diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index 9f3494b8106e..0b560d1b3b3c 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -16,6 +16,7 @@ from vllm.config import KVTransferConfig from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger +from vllm.utils import join_host_port, make_zmq_path, split_host_port logger = init_logger(__name__) NONE_INT = -150886311 @@ -79,18 +80,19 @@ def __init__(self, kv_rank: int, local_rank: int): logger.error( "An error occurred while loading the configuration: %s", exc) raise - prefill_host, base_prefill_port = self.config.prefill_url.split(':') - decode_host, base_decode_port = self.config.decode_url.split(':') + prefill_host, base_prefill_port = split_host_port( + self.config.prefill_url) + decode_host, base_decode_port = split_host_port(self.config.decode_url) # Avoid ports conflict when running prefill and decode on the same node if prefill_host == decode_host and \ base_prefill_port == base_decode_port: - base_decode_port = str(int(base_decode_port) + 100) + base_decode_port = base_decode_port + 100 - prefill_port = int(base_prefill_port) + self.local_rank - decode_port = int(base_decode_port) + self.local_rank - self.prefill_url = ':'.join([prefill_host, str(prefill_port)]) - self.decode_url = ':'.join([decode_host, str(decode_port)]) + prefill_port = base_prefill_port + self.local_rank + decode_port = base_decode_port + self.local_rank + self.prefill_url = join_host_port(prefill_host, prefill_port) + self.decode_url = join_host_port(decode_host, decode_port) self.initialize(self.prefill_url if kv_rank == 0 else self.decode_url, self.config.metadata_server, self.config.protocol, @@ -110,22 +112,30 @@ def __init__(self, kv_rank: int, local_rank: int): self._setup_metadata_sockets(kv_rank, prefill_host, 
base_prefill_port, decode_host, base_decode_port) - def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: str, - d_host: str, d_port: str) -> None: + def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: int, + d_host: str, d_port: int) -> None: """Set up ZeroMQ sockets for sending and receiving data.""" # Offsets < 8 are left for initialization in case tp and pp are enabled - p_rank_offset = int(p_port) + 8 + self.local_rank * 2 - d_rank_offset = int(d_port) + 8 + self.local_rank * 2 + p_rank_offset = p_port + 8 + self.local_rank * 2 + d_rank_offset = d_port + 8 + self.local_rank * 2 if kv_rank == 0: - self.sender_socket.bind(f"tcp://{p_host}:{p_rank_offset + 1}") - self.receiver_socket.connect(f"tcp://{d_host}:{d_rank_offset + 1}") - self.sender_ack.connect(f"tcp://{d_host}:{d_rank_offset + 2}") - self.receiver_ack.bind(f"tcp://{p_host}:{p_rank_offset + 2}") + self.sender_socket.bind( + make_zmq_path("tcp", p_host, p_rank_offset + 1)) + self.receiver_socket.connect( + make_zmq_path("tcp", d_host, d_rank_offset + 1)) + self.sender_ack.connect( + make_zmq_path("tcp", d_host, d_rank_offset + 2)) + self.receiver_ack.bind( + make_zmq_path("tcp", p_host, p_rank_offset + 2)) else: - self.receiver_socket.connect(f"tcp://{p_host}:{p_rank_offset + 1}") - self.sender_socket.bind(f"tcp://{d_host}:{d_rank_offset + 1}") - self.receiver_ack.bind(f"tcp://{d_host}:{d_rank_offset + 2}") - self.sender_ack.connect(f"tcp://{p_host}:{p_rank_offset + 2}") + self.receiver_socket.connect( + make_zmq_path("tcp", p_host, p_rank_offset + 1)) + self.sender_socket.bind( + make_zmq_path("tcp", d_host, d_rank_offset + 1)) + self.receiver_ack.bind( + make_zmq_path("tcp", d_host, d_rank_offset + 2)) + self.sender_ack.connect( + make_zmq_path("tcp", p_host, p_rank_offset + 2)) def initialize(self, local_hostname: str, metadata_server: str, protocol: str, device_name: str, diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py index 36ab2eb3a62f..0a786b4a1708 100644 --- a/vllm/distributed/tpu_distributed_utils.py +++ b/vllm/distributed/tpu_distributed_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import OrderedDict from typing import Optional diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 873d29aaf312..cf94b6a64281 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1393,15 +1393,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # Only Fp16 and Bf16 dtypes since we only support FA. - V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] - if current_platform.is_cpu(): - V1_SUPPORTED_DTYPES.append(torch.float32) - if model_config.dtype not in V1_SUPPORTED_DTYPES: - _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. 
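The mooncake_pipe changes above route all address handling through the vllm.utils helpers instead of ad-hoc string splitting. A small usage sketch of the same pattern; the port values are illustrative:

from vllm.utils import join_host_port, make_zmq_path, split_host_port

host, base_port = split_host_port("[::1]:21001")       # ('::1', 21001)
decode_port = base_port + 100                          # same-node offset, as above
print(join_host_port(host, decode_port))               # '[::1]:21101'
print(make_zmq_path("tcp", host, base_port + 8 + 1))   # 'tcp://[::1]:21010'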
if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1054b969cd3b..4b6c50526b10 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -28,7 +28,8 @@ ChatCompletionToolMessageParam) from openai.types.chat.chat_completion_content_part_input_audio_param import ( InputAudio) -from pydantic import TypeAdapter +from PIL import Image +from pydantic import BaseModel, ConfigDict, TypeAdapter # yapf: enable from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin) @@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): """The type of the content part.""" +class PILImage(BaseModel): + """ + A PIL.Image.Image object. + """ + image_pil: Image.Image + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class CustomChatCompletionContentPILImageParam(TypedDict, total=False): + """A simpler version of the param that only accepts a PIL image. + + Example: + { + "image_pil": ImageAsset('cherry_blossom').pil_image + } + """ + image_pil: Required[PILImage] + + class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. @@ -129,6 +149,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartInputAudioParam, ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, + CustomChatCompletionContentPILImageParam, CustomChatCompletionContentSimpleImageParam, ChatCompletionContentPartImageEmbedsParam, CustomChatCompletionContentSimpleAudioParam, @@ -631,6 +652,10 @@ def parse_image_embeds(self, image_embeds: Union[str, dict[str, str]]) -> None: raise NotImplementedError + @abstractmethod + def parse_image_pil(self, image_pil: Image.Image) -> None: + raise NotImplementedError + @abstractmethod def parse_audio(self, audio_url: str) -> None: raise NotImplementedError @@ -677,6 +702,10 @@ def parse_image_embeds(self, self._add_placeholder(placeholder) + def parse_image_pil(self, image_pil: Image.Image) -> None: + placeholder = self._tracker.add("image", image_pil) + self._add_placeholder(placeholder) + def parse_audio(self, audio_url: str) -> None: audio = self._connector.fetch_audio(audio_url) @@ -733,6 +762,13 @@ def parse_image_embeds(self, placeholder = self._tracker.add("image_embeds", future) self._add_placeholder(placeholder) + def parse_image_pil(self, image_pil: Image.Image) -> None: + future: asyncio.Future[Image.Image] = asyncio.Future() + future.set_result(image_pil) + + placeholder = self._tracker.add("image", future) + self._add_placeholder(placeholder) + def parse_audio(self, audio_url: str) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) @@ -851,12 +887,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam) _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) +_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam) # Need to validate url objects _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python _AudioParser = 
TypeAdapter(ChatCompletionContentPartAudioParam).validate_python _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python -_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio] +_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage] # Define a mapping from part types to their corresponding parsing functions. MM_PARSER_MAP: dict[ @@ -869,6 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], lambda part: _ImageParser(part).get("image_url", {}).get("url", None), "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None), + "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), "input_audio": @@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part( VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", - "image_embeds", + "image_embeds", "image_pil", "audio_url", "input_audio", "video_url") @@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part( else: return str_content + if part_type == "image_pil": + image_content = cast(Image.Image, content) + mm_parser.parse_image_pil(image_content) + return {'type': 'image'} if wrap_dicts else None if part_type == "image_url": str_content = cast(str, content) mm_parser.parse_image(str_content) diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index a9e703cf8889..87fb9f351464 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -44,6 +44,7 @@ def subparser_init( cmd_cls.name, help=cmd_cls.help, description=cmd_cls.help, + usage=f"vllm bench {cmd_cls.name} [options]", ) cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) cmd_cls.add_cli_args(cmd_subparser) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 917b3bbbb982..6c0a95ebb1ee 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -910,6 +910,8 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request): } if envs.VLLM_SERVER_DEV_MODE: + logger.warning("SECURITY WARNING: Development endpoints are enabled! " + "This should NOT be used in production!") @router.get("/server_info") async def show_server_info(raw_request: Request): diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3df11db33384..93d9c588d8d2 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -229,7 +229,6 @@ class ChatCompletionRequest(OpenAIBaseModel): logit_bias: Optional[dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 - # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens: Optional[int] = Field( default=None, deprecated= @@ -433,23 +432,10 @@ class ChatCompletionRequest(OpenAIBaseModel): } def to_beam_search_params( - self, - default_max_tokens: int, - default_sampling_params: Optional[dict] = None - ) -> BeamSearchParams: - # TODO(#9845): remove max_tokens when field is removed from OpenAI API - max_tokens = self.max_completion_tokens or self.max_tokens + self, max_tokens: int, + default_sampling_params: dict) -> BeamSearchParams: - if default_sampling_params is None: - default_sampling_params = {} n = self.n if self.n is not None else 1 - - # Use minimum of context window, user request & server limit. 
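The image_pil content part added to chat_utils above lets callers pass an in-memory PIL image straight through the chat API instead of a URL. A hedged usage sketch; the model name and image file are illustrative:

from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # any multimodal chat model
image = Image.open("cherry_blossom.jpg")      # any PIL.Image.Image works
outputs = llm.chat([{
    "role": "user",
    "content": [
        {"type": "image_pil", "image_pil": image},
        {"type": "text", "text": "Describe this image."},
    ],
}])
print(outputs[0].outputs[0].text)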
- max_tokens = min( - val for val in (default_max_tokens, max_tokens, - default_sampling_params.get("max_tokens", None)) - if val is not None) - if (temperature := self.temperature) is None: temperature = default_sampling_params.get( "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) @@ -465,21 +451,10 @@ def to_beam_search_params( def to_sampling_params( self, - default_max_tokens: int, + max_tokens: int, logits_processor_pattern: Optional[str], - default_sampling_params: Optional[dict] = None, + default_sampling_params: dict, ) -> SamplingParams: - # TODO(#9845): remove max_tokens when field is removed from OpenAI API - max_tokens = self.max_completion_tokens or self.max_tokens - - if default_sampling_params is None: - default_sampling_params = {} - - # Use minimum of context window, user request & server limit. - max_tokens = min( - val for val in (default_max_tokens, max_tokens, - default_sampling_params.get("max_tokens", None)) - if val is not None) # Default parameters if (repetition_penalty := self.repetition_penalty) is None: @@ -898,22 +873,15 @@ class CompletionRequest(OpenAIBaseModel): } def to_beam_search_params( - self, - default_max_tokens: int, - default_sampling_params: Optional[dict] = None + self, + max_tokens: int, + default_sampling_params: Optional[dict] = None, ) -> BeamSearchParams: - max_tokens = self.max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 - # Use minimum of context window, user request & server limit. - max_tokens = min( - val for val in (default_max_tokens, max_tokens, - default_sampling_params.get("max_tokens", None)) - if val is not None) - if (temperature := self.temperature) is None: temperature = default_sampling_params.get("temperature", 1.0) @@ -928,21 +896,14 @@ def to_beam_search_params( def to_sampling_params( self, - default_max_tokens: int, + max_tokens: int, logits_processor_pattern: Optional[str], default_sampling_params: Optional[dict] = None, ) -> SamplingParams: - max_tokens = self.max_tokens if default_sampling_params is None: default_sampling_params = {} - # Use minimum of context window, user request & server limit. 
- max_tokens = min( - val for val in (default_max_tokens, max_tokens, - default_sampling_params.get("max_tokens", None)) - if val is not None) - # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -1813,7 +1774,7 @@ def to_sampling_params( self, default_max_tokens: int, default_sampling_params: Optional[dict] = None) -> SamplingParams: - # TODO(#9845): remove max_tokens when field is removed from OpenAI API + max_tokens = default_max_tokens if default_sampling_params is None: @@ -2029,7 +1990,7 @@ def to_sampling_params( self, default_max_tokens: int, default_sampling_params: Optional[dict] = None) -> SamplingParams: - # TODO(#9845): remove max_tokens when field is removed from OpenAI API + max_tokens = default_max_tokens if default_sampling_params is None: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 299ade4e4d7d..a802fbc3865f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -34,6 +34,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall) +from vllm.entrypoints.utils import get_max_tokens from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -233,15 +234,22 @@ async def create_chat_completion( try: for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] - default_max_tokens = self.max_model_len - len( - engine_prompt["prompt_token_ids"]) + + if self.default_sampling_params is None: + self.default_sampling_params = {} + + max_tokens = get_max_tokens( + max_model_len=self.max_model_len, + request=request, + input_length=len(engine_prompt["prompt_token_ids"]), + default_sampling_params=self.default_sampling_params) + if request.use_beam_search: sampling_params = request.to_beam_search_params( - default_max_tokens, self.default_sampling_params) + max_tokens, self.default_sampling_params) else: sampling_params = request.to_sampling_params( - default_max_tokens, - self.model_config.logits_processor_pattern, + max_tokens, self.model_config.logits_processor_pattern, self.default_sampling_params) self._log_inputs(request_id, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8171b491aafc..6c9c29b71445 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -33,6 +33,7 @@ is_text_tokens_prompt) # yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.utils import get_max_tokens from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt, is_tokens_prompt) from vllm.logger import init_logger @@ -160,15 +161,22 @@ async def create_completion( input_length = len(engine_prompt["prompt_token_ids"]) else: assert_never(engine_prompt) - default_max_tokens = self.max_model_len - input_length + + if self.default_sampling_params is None: + self.default_sampling_params = {} + + max_tokens = get_max_tokens( + max_model_len=self.max_model_len, + request=request, + input_length=input_length, + default_sampling_params=self.default_sampling_params) if request.use_beam_search: sampling_params = request.to_beam_search_params( - default_max_tokens, 
self.default_sampling_params) + max_tokens, self.default_sampling_params) else: sampling_params = request.to_sampling_params( - default_max_tokens, - self.model_config.logits_processor_pattern, + max_tokens, self.model_config.logits_processor_pattern, self.default_sampling_params) request_id_item = f"{request_id}-{i}" diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 6dd8336e52de..9392e4f0e1dc 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import json from collections.abc import Sequence -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import regex as re diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 5b085e5b7947..423b99dbe565 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -5,13 +5,17 @@ import asyncio import functools import os -from typing import Any, Optional +import sys +from typing import Any, Optional, Union from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + CompletionRequest) from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -181,7 +185,6 @@ def _validate_truncation_size( def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser, subcommand_name: list[str]): - import sys # Only handle --help= for the current subcommand. 
# Since subparser_init() runs for all subcommands during CLI setup, @@ -242,3 +245,18 @@ def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser, print(f"\nNo group or parameter matching '{search_keyword}'") print("Tip: use `--help=listgroup` to view all groups.") sys.exit(1) + + +def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest, + CompletionRequest], + input_length: int, default_sampling_params: dict) -> int: + + max_tokens = getattr(request, "max_completion_tokens", + None) or request.max_tokens + default_max_tokens = max_model_len - input_length + max_output_tokens = current_platform.get_max_output_tokens(input_length) + + return min(val + for val in (default_max_tokens, max_tokens, max_output_tokens, + default_sampling_params.get("max_tokens")) + if val is not None) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 6b08f32dff18..a8788e340fc8 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch @@ -184,15 +185,14 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self, max_num_tokens: int, - world_size: int, - dp_size: int, + num_dispatchers: int, block_shape: list[int], per_act_token_quant=False): """ max_num_tokens: Maximum number of tokens from a DP Rank - world_size: Number of EP ranks - dp_size: Number of data-parallel ranks - block_shape: Block quantization block shape + num_dispatchers: The number of DP dispatchers. + block_shape: Block quantization block shape. + per_act_token_quant: Per activation token quantization flag. """ super().__init__( FusedMoEQuantConfig( @@ -202,8 +202,7 @@ def __init__(self, )) assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE self.max_num_tokens = max_num_tokens - self.world_size = world_size - self.dp_size = dp_size + self.num_dispatchers = num_dispatchers @property def activation_formats( @@ -233,7 +232,7 @@ def workspace_shapes( # FIXME (varun): We should be able to dispatch only from the leader # DP ranks in the case of TP > 1. At the moment, all the Ranks # end up sending their tokens. This needs to be fixed. 
- num_dispatchers = self.world_size + num_dispatchers = self.num_dispatchers num_experts = local_num_experts max_num_tokens = a.size( 0) if self.max_num_tokens is None else self.max_num_tokens diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index 3682a536cb5c..0d67b4a4a6d6 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch @@ -15,8 +16,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self, max_num_tokens: int, - world_size: int, - dp_size: int, + num_dispatchers: int, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, @@ -37,35 +37,28 @@ def __init__(self, block_shape=block_shape, per_act_token_quant=per_act_token_quant, )) - self.max_num_tokens = max_num_tokens - self.world_size = world_size - self.dp_size = dp_size self.allow_deep_gemm = allow_deep_gemm - # BatchedTritonKernel doesn't support block quantization - # at the moment. self.batched_triton_experts = BatchedTritonExperts( - max_num_tokens=self.max_num_tokens, - world_size=self.world_size, - dp_size=self.dp_size, + max_num_tokens=max_num_tokens, + num_dispatchers=num_dispatchers, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, per_act_token_quant=self.per_act_token_quant, block_shape=self.block_shape, - ) if self.block_shape is None else None + ) - is_fp8_128_block_quantized = ( - use_fp8_w8a8 and self.block_shape - == BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE) + self.allow_deep_gemm = (allow_deep_gemm and use_fp8_w8a8 + and self.block_shape + == BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE) self.batched_deep_gemm_experts = BatchedDeepGemmExperts( - max_num_tokens=self.max_num_tokens, - world_size=self.world_size, - dp_size=self.dp_size, + max_num_tokens=max_num_tokens, + num_dispatchers=num_dispatchers, block_shape=self.block_shape, # type: ignore[arg-type] - ) if (self.allow_deep_gemm and is_fp8_128_block_quantized) else None + ) if self.allow_deep_gemm else None assert (self.batched_deep_gemm_experts is not None or self.batched_triton_experts is not None) @@ -138,12 +131,8 @@ def apply( workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], ): - use_batched_deep_gemm_experts = (self.allow_deep_gemm - and self.batched_deep_gemm_experts - is not None) experts = (self.batched_deep_gemm_experts - if use_batched_deep_gemm_experts else - self.batched_triton_experts) + if self.allow_deep_gemm else self.batched_triton_experts) assert experts is not None experts.apply(output, hidden_states, w1, w2, topk_ids, activation, global_num_experts, expert_map, w1_scale, w2_scale, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 9a678406b8f3..6c03732030d1 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,6 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.utils import cdiv logger = init_logger(__name__) @@ -68,6 +69,57 @@ class FusedMoEQuantConfig: # TODO: add col major flag? 
# add detailed quant info for input, intermediates, weights, etc? + def __post_init__(self): + assert (not self.per_act_token_quant + or self.block_shape is None), "illegal quantization" + + @property + def is_quantized(self) -> bool: + return self.quant_dtype is not None + + @property + def is_per_act_token(self) -> bool: + return self.per_act_token_quant + + @property + def is_block_quantized(self) -> bool: + return self.block_shape is not None + + @property + def is_per_tensor(self) -> bool: + return not self.per_act_token_quant and self.block_shape is None + + def scale_shape( + self, + max_tokens: int, + hidden_dim: int, + ) -> Optional[tuple[int, int]]: + if self.is_quantized: + if self.is_block_quantized: + assert self.block_shape is not None + _, block_k = self.block_shape + k_tiles = cdiv(hidden_dim, block_k) + return (max_tokens, k_tiles) + elif self.is_per_act_token: + return (max_tokens, 1) + else: + return (1, 1) + else: + return None + + def batched_scale_shape( + self, + num_experts: int, + max_tokens: int, + hidden_dim: int, + ) -> Optional[tuple[int, int, int]]: + if self.is_quantized: + scale_shape = self.scale_shape(max_tokens, hidden_dim) + assert scale_shape is not None + return (num_experts, *scale_shape) + else: + return None + @staticmethod def make( use_fp8_w8a8: bool = False, @@ -109,7 +161,6 @@ class FusedMoEParallelConfig: tp_rank: int dp_rank: int ep_rank: int - world_size: int use_ep: bool # whether to use EP or not @@ -133,7 +184,7 @@ def use_deepep_ll_kernels(self): and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency") @staticmethod - def make(tp_size_: int, dp_size_: int, world_size_: int, + def make(tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig": """ Determine MoE parallel configuration. Based on the input tp_size_, @@ -144,7 +195,6 @@ def make(tp_size_: int, dp_size_: int, world_size_: int, tp_size_ (int): tp_size passed into the FusedMoE constructor. dp_size_ (int): dp_size passed into the FusedMoE constructor. ep_size_ (int): ep_size passed into the FusedMoE constructor. - world_size_ (int): the world size of the current All2All manager. vllm_parallel_config (ParallelConfig): vllm's parallel config object. 
@@ -223,7 +273,6 @@ def flatten_tp_across_dp(dp_rank: int): dp_rank=dp_rank, ep_size=1, ep_rank=0, - world_size=world_size_, use_ep=False) # DP + EP / TP + EP / DP + TP + EP assert use_ep @@ -237,7 +286,6 @@ def flatten_tp_across_dp(dp_rank: int): dp_rank=dp_rank, ep_size=ep_size, ep_rank=ep_rank, - world_size=world_size_, use_ep=True) @@ -263,6 +311,8 @@ def __post_init__(self): logger.debug("Using FusedMoEConfig::max_num_tokens=%d", self.max_num_tokens) + assert self.max_num_tokens > 0 + @property def quant_dtype(self) -> Optional[torch.dtype]: if self.quant_config is not None: @@ -303,10 +353,6 @@ def dp_size(self): def ep_size(self): return self.moe_parallel_config.ep_size - @property - def world_size(self): - return self.moe_parallel_config.world_size - @property def tp_rank(self): return self.moe_parallel_config.tp_rank diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 68ce6bcccb5d..e67ff6688210 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 0ef4e4f767e3..0f41414c4896 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -7,12 +7,17 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) -from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache +from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, + _fp8_quantize, + _resize_cache) from vllm.scalar_type import scalar_types +logger = init_logger(__name__) + def run_cutlass_moe_fp8( output: torch.Tensor, @@ -41,10 +46,7 @@ def run_cutlass_moe_fp8( assert w2_scale is not None assert w1.dtype == torch.float8_e4m3fn assert w2.dtype == torch.float8_e4m3fn - if expert_num_tokens is None: - assert a1q.size(1) == w1.size(2), "Hidden size mismatch w1" - else: - assert a1q.size(2) == w1.size(2), "Hidden size mismatch w1" + assert a1q.size(-1) == w1.size(2), "Hidden size mismatch w1" assert w1.size(1) == w2.size(2) * 2, "Hidden size mismatch w2" assert w1_scale.dim() == 1 or w1_scale.size( 1) == 1 or w1_scale.shape[1] == w1.size(1), "W1 scale shape mismatch" @@ -178,6 +180,8 @@ def run_cutlass_moe_fp8( c2 = _resize_cache(workspace2, (M * topk, N)) c3 = _resize_cache(workspace13, (M * topk, K)) + c1.fill_(0) + ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets, problem_sizes1, ab_strides1, ab_strides1, c_strides1, per_act_token, per_out_ch) @@ -213,6 +217,7 @@ def __init__( per_act_token_quant: bool, per_out_ch_quant: bool, block_shape: Optional[list[int]] = None, + num_dispatchers: Optional[int] = None, use_batched_format: bool = False, ): super().__init__( @@ -223,7 +228,9 @@ def __init__( block_shape=block_shape, )) assert max_experts_per_worker > 0 + assert not use_batched_format or num_dispatchers is not None self.max_experts_per_worker = max_experts_per_worker + self.num_dispatchers = num_dispatchers self.out_dtype = 
out_dtype self.use_batched_format = use_batched_format @@ -260,8 +267,12 @@ def workspace_shapes( output: tuple[int, ...] = () if self.use_batched_format: padded_M = aq.size(1) - workspace1 = (self.max_experts_per_worker, padded_M, max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M, (N // 2)) + num_dp = self.num_dispatchers + assert num_dp is not None + workspace1 = (self.max_experts_per_worker, padded_M * num_dp, + max(N, K)) + workspace2 = (self.max_experts_per_worker, padded_M * num_dp, + (N // 2)) output = (self.max_experts_per_worker, padded_M, K) else: workspace1 = (M * topk, max(2 * N, K)) @@ -311,7 +322,7 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - per_act_token: bool, + per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, @@ -355,6 +366,9 @@ def cutlass_moe_fp8( Returns: - torch.Tensor: The fp16 output tensor after applying the MoE layer. """ + if per_act_token is None: + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) per_out_ch = w1_scale.numel() != w1_q.size(0) num_experts = global_num_experts if global_num_experts != -1 else w1_q.size( @@ -502,3 +516,130 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, out = (c2.view(m, num_topk, k) * topk_weights.view(m, num_topk, 1).half()).sum(dim=1) return out.to(dtype=out_dtype) + + +def _valid_cutlass_block_scaled_grouped_gemm(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor) -> bool: + + def _valid_cutlass_block_scaled_grouped_gemm_shape(M: int, N: int, K: int): + return M >= 128 and N % 128 == 0 and K % 128 == 0 + + m = hidden_states.size(0) + _, K, N = w2.size() + if not _valid_cutlass_block_scaled_grouped_gemm_shape(m, N, K): + logger.debug( + "CutlassBlockScaledGroupedGemm disabled: unaligned problem size.") + return False + + if (w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn): + logger.debug( + "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s).") + return False + + return True + + +def run_cutlass_block_scaled_fused_experts( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + w1_q = w1.transpose(1, 2) + w2_q = w2.transpose(1, 2) + w1_scale = w1_scale.transpose(1, 2) + w2_scale = w2_scale.transpose(1, 2) + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert a.shape[0] == topk_ids.shape[ + 0], "a and topk_ids must have the same batch size" + assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn" + assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn" + assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" + assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" + assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" + assert w1_q.shape[0] == w1_scale.shape[ + 0], "w1_scale expert number mismatch" + assert w1_q.shape[0] == w2_scale.shape[ + 0], "w2_scale expert number mismatch" + assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype" + + out_dtype = a.dtype + num_experts = w1_q.size(0) + m = a.size(0) + k = w1_q.size(1) + n = w2_q.size(1) + + expert_offsets = torch.empty((num_experts + 1, ), + dtype=torch.int32, + device="cuda") + problem_sizes1 = torch.empty((num_experts, 3), +
dtype=torch.int32, + device="cuda") + problem_sizes2 = torch.empty((num_experts, 3), + dtype=torch.int32, + device="cuda") + + topk = topk_ids.size(1) + + a_q, a1_scale = _fp8_quantize(a, + A_scale=None, + per_act_token=False, + block_shape=[128, 128]) + device = a_q.device + + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + + ops.get_cutlass_moe_mm_data( + topk_ids, + expert_offsets, + problem_sizes1, + problem_sizes2, + a_map, + c_map, + num_experts, + n, + k, + ) + + rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) + rep_a1_scales = a1_scale[a_map] + + c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device) + c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device) + + ops.cutlass_blockwise_scaled_grouped_mm( + c1, + rep_a_q, + w1_q, + rep_a1_scales, + w1_scale, + problem_sizes1, + expert_offsets[:-1], + ) + + intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device) + torch.ops._C.silu_and_mul(intermediate, c1) + + intermediate_q, a2_scale = _fp8_quantize(intermediate, + A_scale=None, + per_act_token=False, + block_shape=[128, 128]) + + ops.cutlass_blockwise_scaled_grouped_mm( + c2, + intermediate_q, + w2_q, + a2_scale, + w2_scale, + problem_sizes2, + expert_offsets[:-1], + ) + + return (c2[c_map].view(m, topk, k) * + topk_weights.view(m, topk, 1).to(out_dtype)).sum(dim=1) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index d8ddec9554f0..b625c28d4070 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import deep_ep @@ -16,12 +17,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): Prepare/Finalize using DeepEP High-Throughput kernels. """ - def __init__(self, buffer: deep_ep.Buffer, world_size: int, rank: int, + def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int, dp_size: int, rank_expert_offset: int): super().__init__() self.buffer = buffer - self.world_size = world_size - self.rank = rank + self.num_dispatchers_ = num_dispatchers self.dp_size = dp_size self.rank_expert_offset = rank_expert_offset # The dispatch function returns a handle that the combine function @@ -32,6 +32,9 @@ def __init__(self, buffer: deep_ep.Buffer, world_size: int, rank: int, # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164 self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160] + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + @property def activation_format(self) -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -136,20 +139,7 @@ def prepare( "apply_router_weight_on_input is only implemented for topk=1") a1 = a1 * topk_weights.to(a1.dtype) - # Check if there is a block_shape / or if we can infer the quantization - # schemes from the scales. - per_token_quant = None - if all([ - x is None - for x in [quant_config.block_shape, a1_scale, a2_scale] - ]) and quant_config.quant_dtype is not None: - # Quantization required despite none of the inputs suggesting - # quantization. Fallback to per_token_dynamic quant. 
- per_token_quant = True - else: - per_token_quant = False - - if per_token_quant: + if quant_config.per_act_token_quant: a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index b315b4a97f04..78ac4acc495d 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union import deep_ep @@ -7,7 +8,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( - maybe_fix_scales, moe_kernel_quantize_input) + moe_kernel_quantize_input, normalize_batched_scales_shape) # DeepEP kernels quantize dispatch inputs in 128 element chunks. DEEPEP_QUANT_BLOCK_SIZE = 128 @@ -42,20 +43,21 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__(self, buffer: deep_ep.Buffer, max_tokens_per_rank: int, - world_size: int, - dp_size: int, + num_dispatchers: int, use_fp8_dispatch: bool = False): super().__init__() self.buffer = buffer self.max_tokens_per_rank = max_tokens_per_rank - self.world_size = world_size - self.dp_size = dp_size self.use_fp8_dispatch = use_fp8_dispatch # The dispatch function returns a handle that the combine function # requires. We store the handle here so it is available to the # combine function. self.handle = None + self.num_dispatchers_ = num_dispatchers + + def num_dispatchers(self) -> int: + return self.num_dispatchers_ @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -91,8 +93,6 @@ def _do_quant( assert isinstance(x, torch.Tensor) - assert not per_act_token_quant - num_experts, max_tokens, hidden_dim = x.size() # TODO (varun): Optimization - Use a batched version of quant @@ -104,7 +104,7 @@ def _do_quant( if quant_dtype is not None: assert x_scales is not None - x_scales = maybe_fix_scales(x_scales, num_experts) + x_scales = normalize_batched_scales_shape(x_scales, num_experts) return x, x_scales diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 37a109857ac3..0355abbf1d2b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -12,42 +12,49 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( get_config_dtype_str, try_get_optimal_moe_config) from vllm.model_executor.layers.fused_moe.utils import ( - _resize_cache, moe_kernel_quantize_input) + _resize_cache, moe_kernel_quantize_input, normalize_batched_scales_shape, + normalize_scales_shape) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + group_broadcast) @triton.jit def moe_mmk( - a_ptrs, - b_ptrs, - K, - expert_id, - a_scale_ptr, - b_scale_ptr, - # The stride variables represent how much to increase the ptr by when - # moving by 1 element in a particular dimension. E.g. `stride_am` is - # how much to increase `a_ptr` by to get the element one row down - # (A has M rows). 
- stride_ak, - stride_bk, - stride_asm, - stride_ask, - stride_bse, - stride_bsk, - stride_bsn, - # Offsets and masks - offs_m, - offs_n, - mask_m, - # Block size for block-wise quantization - group_n: tl.constexpr, - group_k: tl.constexpr, - # Meta-parameters - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_K: tl.constexpr, - compute_type: tl.constexpr, - use_w8a8: tl.constexpr, - use_w8a16: tl.constexpr): + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak: tl.int64, + stride_bk: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # Offsets and masks + offs_m, + offs_n, + offs_bn, + mask_m, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + compute_type: tl.constexpr, + use_w8a8: tl.constexpr, + use_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, +): offs_k = tl.arange(0, BLOCK_K) @@ -60,13 +67,22 @@ def moe_mmk( # block-wise if group_k > 0 and group_n > 0: a_scale_ptrs = a_scale_ptr + offs_m * stride_asm - offs_bsn = offs_n // group_n - b_scale_ptrs = (b_scale_ptr + expert_id * stride_bse + - offs_bsn * stride_bsn) + offs_bsn = offs_bn // group_n + b_scale_ptrs = b_scale_ptr + offs_bsn * stride_bsn + + # per act token + elif per_act_token_quant: + # Load per-token scale for activations + a_scale_ptrs = a_scale_ptr + offs_m * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=mask_m, other=0.0)[:, None] + + b_scale_ptrs = b_scale_ptr + offs_bn[None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + # tensor-wise else: a_scale = tl.load(a_scale_ptr) - b_scale = tl.load(b_scale_ptr + expert_id) + b_scale = tl.load(b_scale_ptr) # ----------------------------------------------------------- # Iterate to compute a block of the C matrix. @@ -96,13 +112,11 @@ def moe_mmk( accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] else: - if use_w8a8: - # acc used to enable fp8_fast_accum - accumulator = tl.dot(a, b, acc=accumulator) - else: - accumulator += tl.dot(a, b) + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) else: accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
a_ptrs += BLOCK_K * stride_ak b_ptrs += BLOCK_K * stride_bk @@ -122,47 +136,53 @@ def moe_mmk( @triton.jit def expert_triton_kernel( - a_ptr, #[max_tokens, K] - b_ptr, #[K, N] - c_ptr, #[max_tokens, N] - expert_id, - compute_type: tl.constexpr, - # Dimensions - M, - N, - K, - # Quantization data - a_scale_ptr, - b_scale_ptr, - b_zp_ptr, - # strides - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - stride_asm, - stride_ask, - stride_bse, - stride_bsk, - stride_bsn, - # Blockwise quantization data - group_n, - group_k, - # Quantization schemes - use_fp8_w8a8: tl.constexpr, - use_int8_w8a16: tl.constexpr, - # Kernel config - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_K: tl.constexpr): + a_ptr, #[max_tokens, K] + b_ptr, #[K, N] + c_ptr, #[max_tokens, N] + expert_id, + compute_type: tl.constexpr, + # Dimensions + M, + N, + K, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # strides + stride_am: tl.int64, + stride_ak: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # offsets + offs_bn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, +): offs_m = tl.arange(0, BLOCK_M) offs_n = tl.arange(0, BLOCK_N) % N offs_k = tl.arange(0, BLOCK_K) mask_m = offs_m < M + # Make grids of a + b pointers a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn @@ -179,6 +199,7 @@ def expert_triton_kernel( # (A has M rows). stride_ak, stride_bk, + stride_ase, stride_asm, stride_ask, stride_bse, @@ -187,6 +208,7 @@ def expert_triton_kernel( # Offsets and masks offs_m, offs_n, + offs_bn, mask_m, # Block size for block-wise quantization group_n, @@ -197,7 +219,8 @@ def expert_triton_kernel( BLOCK_K, compute_type, use_fp8_w8a8, - use_int8_w8a16) + use_int8_w8a16, + per_act_token_quant) # store in C offs_cn = tl.arange(0, BLOCK_N) @@ -208,53 +231,57 @@ def expert_triton_kernel( @triton.jit def batched_triton_kernel( - a_ptr, # [E, max_num_tokens, K] - b_ptr, # [E, K, N] - c_ptr, # [E, max_num_tokens, N] - expert_num_tokens, # [E] - compute_type: tl.constexpr, - # Dimensions - max_num_tokens, - K, - N, - # Quantization data - a_scale_ptr, - b_scale_ptr, - b_zp_ptr, - # The stride variables represent how much to increase the ptr by when - # moving by 1 element in a particular dimension. E.g. `stride_am` is - # how much to increase `a_ptr` by to get the element one row down - # (A has M rows). 
- stride_ae, - stride_am, - stride_ak, - stride_be, - stride_bk, - stride_bn, - stride_ce, - stride_cm, - stride_cn, - stride_asm, - stride_ask, - stride_bse, - stride_bsk, - stride_bsn, - # Blockwise quantization data - group_n: tl.constexpr, - group_k: tl.constexpr, - # Quantization schemes - use_fp8_w8a8: tl.constexpr, - use_int8_w8a16: tl.constexpr, - # Kernel config - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_K: tl.constexpr): + a_ptr, # [E, max_num_tokens, K] + b_ptr, # [E, K, N] + c_ptr, # [E, max_num_tokens, N] + expert_num_tokens, # [E] + compute_type: tl.constexpr, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ae: tl.int64, + stride_am: tl.int64, + stride_ak: tl.int64, + stride_be: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_ce: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # Blockwise quantization data + group_n: tl.constexpr, + group_k: tl.constexpr, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, +): expert_id = tl.program_id(axis=0) e_num_tokens = tl.load(expert_num_tokens + expert_id) if e_num_tokens == 0: # Early exit return + # axis 1 is M_blocks * N_blocks pid_mn = tl.program_id(axis=1) #num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M) num_pid_n = tl.cdiv(N, BLOCK_N) @@ -275,6 +302,16 @@ def batched_triton_kernel( c_ptr = (c_ptr + expert_id * stride_ce + cta_m_start * stride_cm + cta_n_start * stride_cn) + offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N).to(tl.int64)) % N + + if use_fp8_w8a8: + a_scale_ptr = a_scale_ptr + expert_id * stride_ase + b_scale_ptr = b_scale_ptr + expert_id * stride_bse + + # block-wise + if group_k > 0 and group_n > 0 or per_act_token_quant: + a_scale_ptr = a_scale_ptr + cta_m_start * stride_asm + expert_triton_kernel( a_ptr, b_ptr, @@ -294,17 +331,21 @@ def batched_triton_kernel( stride_bn, stride_cm, stride_cn, + stride_ase, stride_asm, stride_ask, stride_bse, stride_bsk, stride_bsn, + # offsets + offs_bn, # Blockwise quantization data group_n, group_k, # Quantization schemes use_fp8_w8a8, use_int8_w8a16, + per_act_token_quant, # Kernel config BLOCK_M, BLOCK_N, @@ -326,6 +367,7 @@ def invoke_moe_batched_triton_kernel( use_int8_w8a16: bool, use_int4_w4a16: bool, config: dict[str, int], + per_act_token_quant: bool, block_shape: Optional[list[int]] = None): assert not use_int4_w4a16 @@ -340,6 +382,42 @@ def invoke_moe_batched_triton_kernel( grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) * triton.cdiv(B.size(1), BLOCK_N)) + A_scale = normalize_batched_scales_shape(A_scale, + expert_num_tokens.shape[0]) + + if B_scale is not None and B_scale.ndim == 1: + assert B_scale.numel() == expert_num_tokens.shape[0] + B_scale = B_scale.view(-1, 1, 1) + + assert A_scale is None or A_scale.ndim == 3, ( + f"{0 if A_scale is None else A_scale.shape}") + assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, ( + f"{0 if B_scale is None else B_scale.shape}") + + if B_scale is not 
None: + if B_scale.ndim == 1: + stride_bse = 1 + stride_bsk = 0 + stride_bsn = 0 + else: + stride_bse = B_scale.stride(0) + stride_bsk = B_scale.stride(2) + stride_bsn = B_scale.stride(1) + + else: + stride_bse = 0 + stride_bsk = 0 + stride_bsn = 0 + + if A_scale is not None: + stride_ase = A_scale.stride(0) + stride_asm = A_scale.stride(1) + stride_ask = A_scale.stride(2) + else: + stride_ase = 0 + stride_asm = 0 + stride_ask = 0 + batched_triton_kernel[grid]( A, B, @@ -364,17 +442,19 @@ def invoke_moe_batched_triton_kernel( C.stride(0), C.stride(1), C.stride(2), - A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, - A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, - B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, - B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, - B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + stride_ase, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, # Blockwise quantization data 0 if block_shape is None else block_shape[0], 0 if block_shape is None else block_shape[1], # Quantization schemes use_fp8_w8a8, use_int8_w8a16, + per_act_token_quant, # Kernel config BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, @@ -391,15 +471,15 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__( self, max_num_tokens: int, - world_size: int, - dp_size: int, + num_local_experts: int, + num_dispatchers: int, rank: int, ): super().__init__() - self.world_size = world_size - self.dp_size = dp_size - self.rank = rank self.max_num_tokens = max_num_tokens + self.num_local_experts = num_local_experts + self.rank = rank + self.num_dispatchers_ = num_dispatchers @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -411,6 +491,9 @@ def max_num_tokens_per_rank(self) -> Optional[int]: def topk_indices_dtype(self) -> Optional[torch.dtype]: return None + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + def prepare( self, a1: torch.Tensor, @@ -442,9 +525,7 @@ def prepare( dtype=torch.int, device=a1.device) - assert num_experts % self.world_size == 0 - - num_local_experts = num_experts // self.world_size + num_local_experts = self.num_local_experts if quant_config.quant_dtype is None: b_type = a1.dtype @@ -456,21 +537,53 @@ def prepare( dtype=b_type, device=a1.device) - b_a1_scale = None + if quant_config.is_quantized: + scale_shape = quant_config.batched_scale_shape( + num_local_experts, self.max_num_tokens, hidden_dim) - assert quant_config.quant_dtype is None, "quantization NYI" + b_a1_scale = torch.empty(scale_shape, + dtype=torch.float32, + device=a1.device) + else: + assert a1_scale is None + b_a1_scale = None first_expert = num_local_experts * self.rank last_expert = first_expert + num_local_experts + a1_scale = normalize_scales_shape(a1_scale) + a2_scale = normalize_scales_shape(a2_scale) + for expert_id in range(first_expert, last_expert): topks = torch.any(topk_ids == expert_id, dim=1).flatten() rows = torch.count_nonzero(topks.flatten()) if rows == 0: continue idx = expert_id - first_expert - b_a1[idx, :rows, :] = a1[:topks.numel()][topks] tokens_per_expert[idx] = rows + rhs = a1[:topks.numel()][topks] + if quant_config.quant_dtype is not None: + if a1_scale is not None: + if quant_config.is_per_act_token: + rhs_a1_scale = a1_scale[:topks.numel()][topks] + else: + rhs_a1_scale = a1_scale + else: + rhs_a1_scale = None + b_a1[idx, :rows, :], b_s = moe_kernel_quantize_input( + rhs, + rhs_a1_scale, + 
quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + ) + assert b_s is not None + if quant_config.is_per_act_token: + b_a1_scale[idx, :rows] = b_s[:rows] + else: + b_a1_scale[idx, :b_s.shape[0]] = b_s + else: + b_a1[idx, :rows, :] = rhs assert b_a1_scale is None or b_a1_scale.ndim == 3 @@ -514,8 +627,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, max_num_tokens: int, - world_size: int, - dp_size: int, + num_dispatchers: int, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, @@ -532,13 +644,11 @@ def __init__( per_act_token_quant=per_act_token_quant, block_shape=block_shape, )) - assert not use_fp8_w8a8, "NYI" assert not use_int8_w8a8, "NYI" assert not use_int8_w8a16, "NYI" assert not use_int4_w4a16, "NYI" self.max_num_tokens = max_num_tokens - self.world_size = world_size - self.dp_size = dp_size + self.num_dispatchers = num_dispatchers @property def activation_formats( @@ -565,11 +675,21 @@ def workspace_shapes( local_num_experts: int, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: assert a.dim() == 2 - num_dp = self.dp_size + num_dp = self.num_dispatchers num_experts = local_num_experts workspace13 = (num_experts, self.max_num_tokens * num_dp, K) workspace2 = (self.max_num_tokens * num_dp, N) - return (workspace13, workspace2, workspace13, a.dtype) + output = workspace13 + return (workspace13, workspace2, output, a.dtype) + + def dequant(self, t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + assert self.quant_config.is_quantized + f32 = torch.float32 + if (self.quant_config.is_per_act_token + or self.quant_config.is_per_tensor): + return t.to(f32) * scale + else: + return t.to(f32) * group_broadcast(scale, t.shape) def apply( self, @@ -612,9 +732,95 @@ def apply( continue tmp = _resize_cache(workspace2, (num, N)) - input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1) - self.activation(activation, tmp, input) - output[expert, :num, :] = tmp @ w2[expert].transpose(0, 1) + + if self.quant_config.is_quantized: + assert a1q_scale is not None and w1_scale is not None + input = self.dequant(hidden_states[expert, :, :], + a1q_scale[expert]) + w1_dq = self.dequant(w1[expert], w1_scale[expert]) + input = input[:num] @ w1_dq.transpose(0, 1) + else: + input = hidden_states[expert, :num, :] @ w1[expert].transpose( + 0, 1) + + self.activation(activation, tmp, input.to(tmp.dtype)) + + if self.quant_config.is_quantized: + assert w2_scale is not None + w2_dq = self.dequant(w2[expert], w2_scale[expert]) + else: + w2_dq = w2[expert] + + output[expert, :num, :] = tmp @ w2_dq.transpose(0, 1).to(tmp.dtype) + + +def batched_moe_kernel_quantize_input( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + num_tokens: int, + E: int, + N: int, + expert_num_tokens: torch.Tensor, + qtype: Optional[torch.dtype], + per_act_token_quant: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing()): + # Note: this does a bunch of extra work because expert_num_tokens is + # ignored but it does support torch.compile + cudagraphs. 
+ hidden_dim = A.size(-1) + assert A_scale is None or A_scale.ndim <= 2, ( + f"{A_scale.shape if A_scale is not None else None}") + A_q, A_q_scale = moe_kernel_quantize_input(A.view(-1, + hidden_dim), A_scale, + qtype, per_act_token_quant, + block_shape) + A_q = A_q.view(E, -1, hidden_dim) + A_q_scale = normalize_batched_scales_shape(A_q_scale, E) + + return A_q, A_q_scale + elif qtype is None: + return A, normalize_batched_scales_shape(A_scale, E) + else: + A_q = torch.empty_like(A, dtype=qtype) + + if per_act_token_quant: + assert block_shape is None + scale_shape = (E, num_tokens, 1) + elif block_shape is not None: + _, block_k = block_shape + k_tiles = (A.shape[-1] + block_k - 1) // block_k + scale_shape = (E, num_tokens, k_tiles) + else: + scale_shape = (E, 1, 1) + + A_q_scale = torch.zeros(scale_shape, + dtype=torch.float32, + device=A.device) + + num_experts = expert_num_tokens.numel() + + A_scale = normalize_batched_scales_shape(A_scale, num_experts) + + for e in range(E): + num_tokens = int(expert_num_tokens[e].item()) + if num_tokens > 0: + if A_scale is not None: + scales = A_scale[e, :min(num_tokens, A_scale.shape[1])] + else: + scales = None + A_q[e, :num_tokens], tmp_scale = moe_kernel_quantize_input( + A[e, :num_tokens], + scales, + qtype, + per_act_token_quant, + block_shape, + ) + assert tmp_scale is not None + A_q_scale[e, :tmp_scale.shape[0]] = tmp_scale + + return A_q, A_q_scale class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -627,8 +833,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, max_num_tokens: int, - world_size: int, - dp_size: int, + num_dispatchers: int, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, @@ -648,17 +853,14 @@ def __init__( assert not use_int8_w8a8, "NYI" assert not use_int8_w8a16, "NYI" assert not use_int4_w4a16, "NYI" + assert max_num_tokens > 0 + assert num_dispatchers > 0 self.use_fp8_w8a8 = use_fp8_w8a8 self.use_int8_w8a8 = use_int8_w8a8 self.use_int4_w4a16 = use_int4_w4a16 self.use_int8_w8a16 = use_int8_w8a16 self.max_num_tokens = max_num_tokens - self.world_size = world_size - self.dp_size = dp_size - assert world_size > 0 - assert dp_size > 0 - assert dp_size <= world_size - assert max_num_tokens > 0 + self.num_dispatchers = num_dispatchers @property def activation_formats( @@ -685,7 +887,7 @@ def workspace_shapes( local_num_experts: int, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: assert a.dim() == 2 - num_dp = self.world_size + num_dp = self.num_dispatchers num_experts = local_num_experts max_num_tokens = self.max_num_tokens workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N)) @@ -772,51 +974,48 @@ def apply( if self.use_fp8_w8a8: intermediate_cache1.fill_(0) + a1q_scale = normalize_batched_scales_shape(a1q_scale, E) + # MM1 - invoke_moe_batched_triton_kernel(A=hidden_states, - B=w1, - C=intermediate_cache1, - expert_num_tokens=expert_num_tokens, - compute_type=compute_type, - A_scale=a1q_scale, - B_scale=w1_scale, - B_zp=w1_zp, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - config=config, - block_shape=self.block_shape) + invoke_moe_batched_triton_kernel( + A=hidden_states, + B=w1, + C=intermediate_cache1, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a1q_scale, + B_scale=w1_scale, + B_zp=w1_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, 
+ config=config, + per_act_token_quant=self.per_act_token_quant, + block_shape=self.block_shape) intermediate_cache2.fill_(0) - # TODO: would be nice to use expert_num_tokens here to reduce - # garbage compute + # TODO (bnell): use triton utility from batched deep gemm. self.activation(activation, intermediate_cache2.view(-1, N // 2), intermediate_cache1.view(-1, N)) - ic2_hidden_size = intermediate_cache2.size(-1) - intermediate_cache2 = intermediate_cache2.view(-1, ic2_hidden_size) - - qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( - A=intermediate_cache2, - A_scale=a2_scale, - quant_dtype=self.quant_dtype, + qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input( + intermediate_cache2, a2_scale, max_num_tokens, E, N, + expert_num_tokens, self.quant_dtype, self.per_act_token_quant, + self.block_shape) + + invoke_moe_batched_triton_kernel( + A=qintermediate_cache2, + B=w2, + C=output, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a2q_scale, + B_scale=w2_scale, + B_zp=w2_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + config=config, per_act_token_quant=self.per_act_token_quant, block_shape=self.block_shape) - - qintermediate_cache2 = qintermediate_cache2.view( - (E, -1, ic2_hidden_size)) - - invoke_moe_batched_triton_kernel(A=qintermediate_cache2, - B=w2, - C=output, - expert_num_tokens=expert_num_tokens, - compute_type=compute_type, - A_scale=a2q_scale, - B_scale=w2_scale, - B_zp=w2_zp, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - config=config, - block_shape=self.block_shape) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 40b76994f412..1988c73ba7e2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -24,6 +24,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, quant_type_id: int, + apply_router_weight_on_input: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, global_scale1: Optional[torch.Tensor] = None, @@ -149,7 +150,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, topk_weights, moe_block_size=block_size_m, top_k=topk, - mul_topk_weights=False, + mul_topk_weights=apply_router_weight_on_input, is_ep=expert_map is not None, b_q_type=quant_type, size_m=M, @@ -182,7 +183,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, topk_weights, moe_block_size=block_size_m, top_k=1, - mul_topk_weights=True, + mul_topk_weights=not apply_router_weight_on_input, is_ep=expert_map is not None, b_q_type=quant_type, size_m=M * topk, @@ -208,6 +209,7 @@ def fused_marlin_moe_fake(hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, quant_type_id: int, + apply_router_weight_on_input: bool = False, global_num_experts: int = -1, global_scale1: Optional[torch.Tensor] = None, global_scale2: Optional[torch.Tensor] = None, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 75712b8e3a4d..fbbccbb34d90 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -15,6 +15,9 @@ # yapf: disable from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, get_config_quant_dtype) +from 
vllm.model_executor.layers.fused_moe.cutlass_moe import ( + _valid_cutlass_block_scaled_grouped_gemm, + run_cutlass_block_scaled_fused_experts) # yapf: enable from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm, deep_gemm_moe_fp8) @@ -1127,29 +1130,33 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: return torch_vllm_outplace_fused_experts -def fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - allow_deep_gemm: bool = False) -> torch.Tensor: +# TODO (bnell): replace this with modular op. Can get rid of inplace/outplace +# torch ops. +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + allow_deep_gemm: bool = False, + allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor: # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. 
N = w1.size(1) @@ -1172,6 +1179,17 @@ def fused_experts(hidden_states: torch.Tensor, a2_scale=a2_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) + elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8 + and _valid_cutlass_block_scaled_grouped_gemm(hidden_states, w1, w2)): + assert apply_router_weight_on_input is False + return run_cutlass_block_scaled_fused_experts( + a=hidden_states, + w1=w1, + w2=w2, + w1_scale=w1_scale, + w2_scale=w2_scale, + topk_weights=topk_weights, + topk_ids=topk_ids) else: return dispatch_fused_experts_func(inplace)( hidden_states=hidden_states, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6f9770262856..36ac75a8df4b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -14,7 +14,6 @@ from vllm.config import get_current_vllm_config from vllm.distributed import (get_dp_group, get_ep_group, get_tensor_model_parallel_world_size, - get_world_group, tensor_model_parallel_all_reduce) from vllm.distributed.eplb.eplb_state import EplbState from vllm.forward_context import ForwardContext, get_forward_context @@ -114,6 +113,9 @@ def init_prepare_finalize(self, moe: FusedMoEConfig, hidden_dim_scale_bytes=hidden_scale_bytes, ) + num_dispatchers = (all2all_manager.world_size // + all2all_manager.tp_group.world_size) + # Intranode pplx a2a takes a group name while internode does not. if not all2all_manager.internode: all_to_all_args[ @@ -124,10 +126,8 @@ def init_prepare_finalize(self, moe: FusedMoEConfig, prepare_finalize = PplxPrepareAndFinalize( handle, max_num_tokens=moe.max_num_tokens, - world_size=all2all_manager.world_size, - rank=all2all_manager.rank, - # dp_size actually means tp_size, bug in pplx kernels - dp_size=all2all_manager.tp_group.world_size, + num_local_experts=moe.num_local_experts, + num_dispatchers=num_dispatchers, ) elif moe.use_deepep_ht_kernels: assert moe.dp_size == all2all_manager.dp_world_size @@ -136,16 +136,13 @@ def init_prepare_finalize(self, moe: FusedMoEConfig, handle = all2all_manager.get_handle(all_to_all_args) prepare_finalize = DeepEPHTPrepareAndFinalize( handle, - world_size=all2all_manager.world_size, - rank=all2all_manager.rank, + num_dispatchers=all2all_manager.world_size, dp_size=all2all_manager.dp_world_size, rank_expert_offset=all2all_manager.rank * moe.num_local_experts, ) elif moe.use_deepep_ll_kernels: - assert moe.dp_size == all2all_manager.dp_world_size - all_to_all_args = dict( max_num_tokens_per_dp_rank=moe.max_num_tokens, token_hidden_size=moe.hidden_dim, @@ -168,8 +165,7 @@ def init_prepare_finalize(self, moe: FusedMoEConfig, prepare_finalize = DeepEPLLPrepareAndFinalize( handle, max_tokens_per_rank=moe.max_num_tokens, - world_size=all2all_manager.world_size, - dp_size=all2all_manager.dp_world_size, + num_dispatchers=all2all_manager.world_size, use_fp8_dispatch=use_fp8_dispatch, ) @@ -245,18 +241,12 @@ def select_gemm_impl( assert self.fused_experts == fused_experts - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): logger.debug("BatchedTritonExperts %s", self.moe) - assert self.moe.dp_size == all2all_manager.dp_world_size return BatchedTritonExperts( max_num_tokens=self.moe.max_num_tokens, - world_size=all2all_manager.world_size, - # dp_size actually means tp_size, bug in pplx kernels - dp_size=all2all_manager.tp_group.world_size, 
+ num_dispatchers=prepare_finalize.num_dispatchers(), ) else: logger.debug("TritonExperts %s", self.moe) @@ -652,14 +642,12 @@ def __init__( get_tensor_model_parallel_world_size()) dp_size_ = (dp_size if dp_size is not None else get_dp_group().world_size) - world_size_ = get_world_group().world_size vllm_config = get_current_vllm_config() self.moe_parallel_config: FusedMoEParallelConfig = ( FusedMoEParallelConfig.make( tp_size_=tp_size_, dp_size_=dp_size_, - world_size_=world_size_, vllm_parallel_config=vllm_config.parallel_config)) self.global_num_experts = num_experts + num_redundant_experts @@ -1186,9 +1174,9 @@ def select_experts( logical_replica_count: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ - Route the input hidden states to the top-k experts based on the + Route the input hidden states to the top-k experts based on the router logits. - + Returns: (topk_weights, topk_ids) (tuple[torch.Tensor, torch.Tensor]): The weights and *global physical* expert ids of the top-k experts. @@ -1299,6 +1287,8 @@ def select_experts( topk_ids = topk_ids.to(dtype=indices_type) + assert topk_ids.dtype == indices_type or indices_type is None + return topk_weights, topk_ids def must_reduce_shared_expert_outputs(self) -> bool: @@ -1330,8 +1320,13 @@ def maybe_all_reduce_tensor_model_parallel( def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): - return torch.ops.vllm.moe_forward(hidden_states, router_logits, - self.layer_name) + # TODO: Once the OOM issue for the TPU backend is resolved, we will + # switch to using the moe_forward custom op. + if current_platform.is_tpu(): + return self.forward_impl(hidden_states, router_logits) + else: + return torch.ops.vllm.moe_forward(hidden_states, router_logits, + self.layer_name) def forward_impl_chunked(self, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor): diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 2ffb4d328eca..f332b5168913 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -193,6 +193,10 @@ def max_num_tokens_per_rank(self) -> Optional[int]: """ raise NotImplementedError + @abstractmethod + def num_dispatchers(self) -> int: + raise NotImplementedError + class FusedMoEPermuteExpertsUnpermute(ABC): """ diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 45e813287d3f..112305a4f2d0 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -8,7 +8,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( - moe_kernel_quantize_input) + _validate_scale_shape, moe_kernel_quantize_input) from vllm.utils import cdiv, round_up @@ -32,16 +32,16 @@ def pplx_hidden_dim_scale_bytes( elem_size = torch.float32.itemsize if per_act_token_quant: - # per-token + # per-token (M x 1) assert block_shape is None hidden_scale_bytes = elem_size elif block_shape is not None: - # per-group + # per-group (M x K_tiles) block_size = block_shape[1] num_blocks = cdiv(hidden_dim, block_size) hidden_scale_bytes = num_blocks * elem_size else: - # per-tensor + # per-tensor (1 x 1) hidden_scale_bytes = elem_size else: hidden_dim_bytes = 
hidden_dim * in_dtype.itemsize @@ -53,25 +53,22 @@ def pplx_hidden_dim_scale_bytes( ) -# The max_num_tokens, world_size and dp_size must be the same -# as the ones used to create the AllToAll. class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__( self, a2a: pplx.AllToAll, max_num_tokens: int, - world_size: int, - rank: int, - dp_size: int, + num_local_experts: int, + num_dispatchers: int, ): super().__init__() assert max_num_tokens > 0 + assert num_local_experts > 0 self.a2a = a2a self.max_num_tokens = max_num_tokens - self.world_size = world_size - self.rank = rank - self.dp_size = dp_size + self.num_local_experts = num_local_experts + self.num_dispatchers_ = num_dispatchers @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -83,6 +80,9 @@ def max_num_tokens_per_rank(self) -> Optional[int]: def topk_indices_dtype(self) -> Optional[torch.dtype]: return torch.uint32 + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + def prepare( self, a1: torch.Tensor, @@ -120,42 +120,64 @@ def prepare( per_act_token_quant=quant_config.per_act_token_quant, block_shape=quant_config.block_shape) + _validate_scale_shape(a1q, a1q_scale, quant_config.per_act_token_quant, + quant_config.block_shape) + if a1q_scale is not None: - if a1q_scale.numel() == 1: - orig_a_scale_block_shape = 1 - else: - orig_a_scale_block_shape = a1q_scale.shape[-1] - a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols) + scalar_scales = a1q_scale.numel() == 1 + + # pplx requires 2-d scales even for scalar scales + if a1q_scale.dim() <= 1: + assert scalar_scales + a1q_scale = a1q_scale.view(1, 1) + + orig_a_scale_block_shape = a1q_scale.shape[-1] - # rem_experts need to be 0 for pplx to work properly. - rem_experts = num_experts % self.world_size - assert rem_experts == 0 - num_local_experts = ((num_experts // self.world_size) + - (1 if self.rank < rem_experts else 0)) + if not quant_config.is_block_quantized: + # TODO (bnell): use group_broadcast instead? 
+ a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols) + + assert a1q_scale is None or a1q_scale.ndim == 2, \ + f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}" expert_num_tokens = torch.empty( - num_local_experts, + self.num_local_experts, dtype=torch.int32, device=device, ) - num_dp = self.world_size // self.dp_size expert_x = torch.empty( - (num_local_experts, self.max_num_tokens * num_dp, hidden_dim), + (self.num_local_experts, + self.max_num_tokens * self.num_dispatchers(), hidden_dim), dtype=a1q.dtype, device=device, ) expert_x_scale: Optional[torch.Tensor] = None if a1q.dtype.itemsize == 1: - block_size = (quant_config.block_shape[1] - if quant_config.block_shape is not None else 1) + if quant_config.is_per_act_token: + # (M x 1) -> (E x M x K) + final_dim = expert_x.size(2) + elif quant_config.is_per_tensor: + # (1 x 1) -> (E x 1 x 1) + final_dim = 1 + else: + # (M x K_tiles) -> (E x M x K_tiles) + assert quant_config.block_shape is not None + num_blocks = cdiv(expert_x.size(2), + quant_config.block_shape[1]) + final_dim = num_blocks + + expert_x_scale_shape = ( + self.num_local_experts, + expert_x.size(1), + round_up(final_dim, 4) # round up for alignment + ) + expert_x_scale = torch.empty( - (num_local_experts, expert_x.size(1), - round_up( - (expert_x.size(2) + block_size - 1) // block_size, 4)), + expert_x_scale_shape, dtype=torch.float32, - device=device, + device=expert_x.device, ) # This argument is optional, defaults to indices.size(0) @@ -171,8 +193,10 @@ def prepare( indices=topk_ids, bound_m=bound_m, ) + if expert_x_scale is not None: expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape] + assert expert_x_scale.ndim == 3 return expert_x, expert_x_scale, expert_num_tokens, None, None @@ -184,13 +208,16 @@ def finalize( topk_ids: torch.Tensor, apply_router_weight_on_input: bool, ) -> None: - num_tokens = output.size(0) # M # This argument is optional # There's not much point setting this unless it is != topk_ids.size(0) bound_m: Optional[torch.Tensor] = None - assert topk_ids.size(0) == num_tokens, ( - f"{topk_ids.size(0)} == {num_tokens}") + # TODO (bnell): fails in test_pplx_moe.py, figure out what's going on + #num_tokens = output.size(0) # M + #assert topk_ids.size(0) == num_tokens, ( + # f"{topk_ids.size(0)} == {num_tokens}") + assert topk_ids.size() == topk_weights.size(), ( + f"{topk_ids.size()} == {topk_weights.size()}") assert output.size(0) <= self.max_num_tokens, ( f"{output.size(0)} <= {self.max_num_tokens}") assert output.size(1) == fused_expert_output.size(-1) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 9e4be82f6c1f..e1114efe5a3f 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -24,6 +24,9 @@ def max_num_tokens_per_rank(self) -> Optional[int]: def topk_indices_dtype(self) -> Optional[torch.dtype]: return None + def num_dispatchers(self) -> int: + return 1 + def prepare( self, a1: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 52346f797440..a90cce719b48 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -99,9 +99,20 @@ def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: return m[idx, ...] 
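The expert_x_scale allocation above picks its trailing dimension from the quantization mode. A standalone sketch of that shape rule, assuming cdiv/round_up behave like their vllm.utils counterparts and treating the quantization mode as plain booleans; tokens_per_expert stands for expert_x.size(1), i.e. max_num_tokens * num_dispatchers, and the numeric example at the end is illustrative only:

from typing import Optional

def cdiv(a: int, b: int) -> int:
    # Ceiling division, mirroring vllm.utils.cdiv.
    return -(-a // b)

def round_up(x: int, m: int) -> int:
    # Round x up to the next multiple of m, mirroring vllm.utils.round_up.
    return ((x + m - 1) // m) * m

def expert_x_scale_shape(num_local_experts: int, tokens_per_expert: int,
                         hidden_dim: int, per_act_token: bool,
                         per_tensor: bool,
                         block_size: Optional[int]) -> tuple[int, int, int]:
    if per_act_token:
        # (M x 1) per-token scales are expanded across the full K dimension.
        final_dim = hidden_dim
    elif per_tensor:
        # A single scalar scale per expert slot.
        final_dim = 1
    else:
        # Block quantization: one scale per K tile.
        assert block_size is not None
        final_dim = cdiv(hidden_dim, block_size)
    # pplx wants the last dimension rounded up to a multiple of 4 for alignment.
    return (num_local_experts, tokens_per_expert, round_up(final_dim, 4))

# e.g. 16 local experts, 128 token slots, K=7168, 128-wide blocks -> (16, 128, 56)
assert expert_x_scale_shape(16, 128, 7168, False, False, 128) == (16, 128, 56)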
-# TODO(bnell): better name -def maybe_fix_scales(scales: Optional[torch.Tensor], - num_experts: int) -> Optional[torch.Tensor]: +def normalize_scales_shape( + scales: Optional[torch.Tensor]) -> Optional[torch.Tensor]: + if scales is not None: + if scales.numel() == 1: + scales = scales.view(1, 1) + else: + scales = scales.view(-1, scales.size(-1)) + return scales + + +def normalize_batched_scales_shape( + scales: Optional[torch.Tensor], + num_experts: int, +) -> Optional[torch.Tensor]: if scales is not None and scales.ndim < 3: if scales.numel() == 1: scales = scales.view(1) @@ -111,3 +122,23 @@ def maybe_fix_scales(scales: Optional[torch.Tensor], scales = scales.view(num_experts, -1, scales.size(-1)) return scales + + +def _validate_scale_shape( + a: torch.Tensor, + a_scale: Optional[torch.Tensor], + per_act_token_quant: bool, + block_shape: Optional[list[int]], +) -> None: + if a_scale is None: + return + + if not per_act_token_quant and block_shape is None: + assert a_scale.numel() == 1, f"{a_scale.shape}" + elif per_act_token_quant: + assert a_scale.shape[0] == a.shape[0] and a_scale.shape[1] == 1, ( + f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1") + else: + assert block_shape is not None + expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) + assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index ccfb278cdff6..3f67fc35afdf 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -108,7 +108,7 @@ def _selective_scan_update_kernel( # is the same as the batch id. if HAS_STATE_BATCH_INDICES: state_batch_indices_ptr += pid_b - state_batch_idx = tl.load(state_batch_indices_ptr) + state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64) state_ptr += (state_batch_idx * stride_state_batch + pid_h * stride_state_head) else: diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index aff54bc495b2..0fdded0b5a7f 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -493,11 +493,6 @@ def apply( assert activation == "silu", "Only SiLU activation is supported." 
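The selective_scan_update change above widens state_batch_idx to int64 before it is multiplied by the batch stride, presumably so the pointer offset cannot wrap in 32-bit arithmetic. A rough back-of-the-envelope illustration of why; the concrete sizes below are made up for the example, not taken from the patch:

INT32_MAX = 2**31 - 1
# Hypothetical per-batch stride of the SSM state tensor, in elements.
stride_state_batch = 64 * 16 * 4096      # heads * head_dim * d_state (assumed)
state_batch_idx = 1024                   # a large cache-slot index
offset = state_batch_idx * stride_state_batch
print(offset > INT32_MAX)                # True: a 32-bit offset would overflow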
- if apply_router_weight_on_input: - raise NotImplementedError( - "Apply router weight on input is not supported for" - "fused Marlin MoE method.") - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -520,6 +515,7 @@ def apply( topk_weights, topk_ids, quant_type_id=self.quant_type.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map, w1_zeros=layer.w13_qzeros, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5d7e00c2b81b..ef67cc0eda46 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -322,6 +322,7 @@ def apply( global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map) @@ -367,6 +368,7 @@ def __init__( "weights") self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations") + self.topk_indices_dtype = None per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR and self.input_quant.strategy @@ -573,6 +575,41 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: from vllm.model_executor.layers.fused_moe import fused_experts self.fused_experts_func = fused_experts + def select_gemm_impl( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + ) -> FusedMoEPermuteExpertsUnpermute: + from vllm.model_executor.layers.fused_moe import TritonExperts + from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts) + + assert not self.rocm_aiter_moe_enabled and not self.use_marlin + + logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__) + + if (prepare_finalize.activation_format == + FusedMoEActivationFormat.BatchedExperts): + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank( + ) + assert max_num_tokens_per_rank is not None + + return BatchedTritonExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + per_act_token_quant=( + self.input_quant.strategy == QuantizationStrategy.TOKEN), + ) + else: + return TritonExperts( + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + per_act_token_quant=( + self.input_quant.strategy == QuantizationStrategy.TOKEN), + ) + def apply( self, layer: torch.nn.Module, @@ -610,7 +647,9 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) if self.rocm_aiter_moe_enabled: return self.rocm_aiter_fused_experts_func( @@ -632,8 +671,6 @@ def apply( if self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") - assert not apply_router_weight_on_input, ( - "Apply router weight on input not supported for Marlin MoE.") return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -644,6 +681,7 @@ def apply( topk_weights, topk_ids, 
quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map) @@ -701,6 +739,7 @@ def __init__( from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp8) + self.topk_indices_dtype = None self.fused_experts = cutlass_moe_fp8 # type: ignore self.disable_expert_map = False @@ -832,18 +871,25 @@ def select_gemm_impl( use_batched_format = (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts) + num_dispatchers = prepare_finalize.num_dispatchers() + num_experts = (moe.num_local_experts if use_batched_format else moe.num_experts) + logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) + experts = CutlassExpertsFp8( num_experts, moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + num_dispatchers=num_dispatchers, use_batched_format=use_batched_format, ) - self.disable_expert_map = not experts.supports_expert_map() + self.disable_expert_map = (num_dispatchers > 1 + or not experts.supports_expert_map()) + return experts def apply( @@ -1312,8 +1358,6 @@ def apply( assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") - assert not apply_router_weight_on_input, ( - "Apply router weight on input not supported for Marlin MoE.") topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, @@ -1337,6 +1381,7 @@ def apply( topk_weights, topk_ids, quant_type_id=self.quant_type.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map, g_idx1=layer.w13_weight_g_idx, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 65cbc49d2640..8ba72162921a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional import torch diff --git a/vllm/model_executor/layers/quantization/deepgemm.py b/vllm/model_executor/layers/quantization/deepgemm.py index e4cf64740758..5903976eaf6b 100644 --- a/vllm/model_executor/layers/quantization/deepgemm.py +++ b/vllm/model_executor/layers/quantization/deepgemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import torch diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f879d0ad091a..5a1a427d7d72 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -473,12 +473,30 @@ def __init__(self, quant_config: Fp8Config): logger.warning_once( "DeepGemm not supported on the current platform.") + # Check for CutlassBlockScaledGroupedGemm support. + self.allow_cutlass_block_scaled_grouped_gemm = False + if not self.block_quant: + logger.warning_once("Model is not block quantized. 
Not using " + "CutlassBlockScaledGroupedGemm kernels") + elif (current_platform.is_cuda() + and current_platform.has_device_capability(100)): + logger.info_once( + "Using CutlassBlockScaledGroupedGemm kernels for Fp8MoEMethod." + ) + self.allow_cutlass_block_scaled_grouped_gemm = True + else: + logger.warning_once( + "CutlassBlockScaledGroupedGemm not supported on the current " + "platform.") + self.topk_indices_dtype = None self.fused_experts = functools.partial( # type: ignore fused_experts, use_fp8_w8a8=True, block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm) + allow_deep_gemm=self.allow_deep_gemm, + allow_cutlass_block_scaled_grouped_gemm=( + self.allow_cutlass_block_scaled_grouped_gemm)) def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -802,10 +820,7 @@ def select_gemm_impl( self.quant_config.weight_block_size, False) return BatchedTritonOrDeepGemmExperts( max_num_tokens=max_num_tokens_per_rank, - world_size=prepare_finalize. - world_size, # type: ignore [attr-defined] - dp_size=prepare_finalize. - dp_size, # type: ignore [attr-defined] + num_dispatchers=prepare_finalize.num_dispatchers(), use_fp8_w8a8=True, block_shape=self.quant_config.weight_block_size, per_act_token_quant=False, @@ -892,8 +907,6 @@ def apply( elif self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") - assert not apply_router_weight_on_input, ( - "Apply router weight on input not supported for Marlin MoE.") return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -904,6 +917,7 @@ def apply( topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map) else: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 48ab04c9ab37..9bed5e2e4889 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -645,10 +645,6 @@ def apply( "EPLB not supported for `GPTQMarlinMoEMethod` yet.") assert activation == "silu", "Only SiLU activation is supported." 
- if apply_router_weight_on_input: - raise NotImplementedError( - "Apply router weight on input is not supported for " - "fused Marlin MoE method.") topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, @@ -672,6 +668,7 @@ def apply( topk_weights, topk_ids, quant_type_id=self.quant_type.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map, g_idx1=layer.w13_g_idx, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index a10911b84afc..9db875330230 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -700,6 +700,7 @@ def apply( global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 7e7fd6d51fd3..68309716cf90 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright © 2025, Oracle and/or its affiliates. import os diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index b9982f312fe5..0b62e744e445 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -104,8 +104,12 @@ def load_model(self, vllm_config: VllmConfig, if is_vllm_tensorized(self.tensorizer_config): tensorizer_config = self._patch_tensorizer_config(model_config) - model = init_tensorizer_model(tensorizer_config=tensorizer_config, - vllm_config=vllm_config) + device_config = vllm_config.device_config + with set_default_torch_dtype(model_config.dtype): + with torch.device(device_config.device): + model = init_tensorizer_model( + tensorizer_config=tensorizer_config, + vllm_config=vllm_config) self.load_weights(model, model_config) return model return self._load_model_serialized_cpu(vllm_config=vllm_config) diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py index 6197bcdba826..b44c165397d0 100644 --- a/vllm/model_executor/model_loader/tpu.py +++ b/vllm/model_executor/model_loader/tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import Optional diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 4611f6704e19..78d86f6f2044 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -2,14 +2,17 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast import torch import torch.nn as nn +from vllm.model_executor.models.config import VerifyAndUpdateConfig + from .interfaces_base import VllmModelForPooling, is_pooling_model if TYPE_CHECKING: + from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import PoolingType _T = TypeVar("_T", 
bound=type[nn.Module]) @@ -39,7 +42,6 @@ def _create_pooling_model_cls( default_softmax: bool, ) -> _T: # Lazy import - from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler, PoolerOutput from vllm.model_executor.pooling_metadata import PoolingMetadata @@ -162,7 +164,6 @@ def as_seq_cls_model(cls: _T) -> _T: return cls # Lazy import - from vllm.config import VllmConfig from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.pooler import PoolerOutput, PoolingType from vllm.model_executor.models.interfaces import SupportsCrossEncoding @@ -193,6 +194,7 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + self.vllm_config = vllm_config self.task = vllm_config.model_config.task self.pooling_type = ( vllm_config.model_config.pooler_config.pooling_type) @@ -242,6 +244,17 @@ def get_logits(hidden_states): ] return PoolerOutput(outputs=pooled_outputs) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + tokens = getattr(self.config, "classifier_from_token", None) + method = getattr(self.config, "method", None) + + if tokens is None and method is None: + return super().load_weights(weights) + else: + # Online convert ForCausalLM into + # ForSequenceClassification model. + return seq_cls_model_loader(self, weights) + ModelForSequenceClassification.__name__ = \ _get_pooling_model_name(cls.__name__, "ForSequenceClassification") @@ -277,3 +290,86 @@ def as_reward_model(cls: _T) -> _T: _get_pooling_model_name(cls.__name__, "ForReward") return ModelForReward # type: ignore + + +class SequenceClassificationConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + config = vllm_config.model_config.hf_config + method = getattr(config, "method", None) + tokens = getattr(config, "classifier_from_token", None) + + if method is None: + return + + assert tokens is not None + assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" + + if method == "from_2_way_softmax": + assert len(tokens) == 2 + config.num_labels = 1 + else: + config.num_labels = len(tokens) + + +def load_weights_using_from_2_way_softmax( + model, weights: Iterable[tuple[str, torch.Tensor]]): + # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 + from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead) + from vllm.model_executor.models.utils import AutoWeightsLoader + + model_config = model.vllm_config.model_config + tokens = getattr(model.config, "classifier_from_token", []) + tokens = cast(list[int], tokens) + assert len(tokens) == 2 + + device = model.score.weight.device + + if model.config.tie_word_embeddings: + model.lm_head = model.model.embed_tokens + else: + model.lm_head = ParallelLMHead(model.config.vocab_size, + model.config.hidden_size, + quant_config=model.quant_config) + + loader = AutoWeightsLoader(model) + loaded_weights = loader.load_weights(weights) + + from vllm.transformers_utils.tokenizer import get_tokenizer + tokenizer = get_tokenizer(model_config.tokenizer, + revision=model_config.tokenizer_revision, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code) + + false_id = tokenizer.convert_tokens_to_ids(tokens[0]) + true_id = tokenizer.convert_tokens_to_ids(tokens[1]) + weight = model.lm_head.weight.data[true_id].to(device).to( + torch.float32) - model.lm_head.weight.data[false_id].to(device).to( + torch.float32) + 
model.score.weight.data.copy_(weight) + + del model.lm_head + loaded_weights.add("score.weight") + loaded_weights.discard("lm_head.weight") + return loaded_weights + + +SEQ_CLS_LOAD_METHODS = { + "from_2_way_softmax": load_weights_using_from_2_way_softmax, +} + + +def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]): + # Online convert ForCausalLM into ForSequenceClassification model. + # - from_2_way_softmax: + # - Qwen3ForCausalLM + # - Qwen3-Reranker + # - Qwen2ForCausalLM + # - mxbai-rerank-v2 + + config = model.vllm_config.model_config.hf_config + method = getattr(config, "method", None) + assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" + return SEQ_CLS_LOAD_METHODS[method](model, weights) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 5214c248a40e..45dd660c8937 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project Adapted from -# https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union, cast diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 29e0e2a2edb1..d743c52074c6 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -9,6 +9,7 @@ from torch import nn from transformers import BambaConfig +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -36,7 +37,7 @@ from vllm.utils import LayerBlockType from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsQuant, SupportsV0Only) + SupportsQuant) from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -97,7 +98,9 @@ def __init__(self, head_dim=config.mamba_d_head, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.mixer", + chunk_size=config.mamba_chunk_size) self.feed_forward = BambaMLP(config, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, @@ -313,10 +316,14 @@ def forward( attn_metadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) + if not envs.VLLM_USE_V1: + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None if get_pp_group().is_first_rank: if inputs_embeds is not None: @@ -337,7 +344,8 @@ def forward( num_attn += 1 layer_mamba_cache_params = None - if isinstance(layer, BambaMixerDecoderLayer): + if isinstance(layer, + BambaMixerDecoderLayer) and mamba_cache_params: layer_mamba_cache_params = mamba_cache_params.at_layer_idx( i - num_attn) @@ -411,7 +419,7 @@ def load_weights(self, weights: Iterable[tuple[str, class 
BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - IsHybrid, SupportsV0Only, SupportsQuant): + IsHybrid, SupportsQuant): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -475,15 +483,22 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_mamba_layers = \ + self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, + LayerBlockType.mamba + ) + + self.mamba_cache = MambaCacheManager( + self.vllm_config, self.lm_head.weight.dtype, + num_mamba_layers, *self._get_mamba_cache_shape()) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 7b5345704ad0..552c4b074216 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -167,7 +167,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: assert tokens is not None and len(tokens) == 2, \ ("Try loading the original Qwen3 Reranker?, see: " "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py") - config.num_labels = 1 + vllm_config.model_config.hf_config.method = "from_2_way_softmax" class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 01a27d02a304..4bdcbfabbbc2 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 28f257eabed0..a76e1f256e04 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -8,6 +8,7 @@ from torch import nn from transformers import FalconH1Config +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -33,8 +34,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsV0Only) +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -85,6 +85,7 @@ def __init__( config: FalconH1Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.config = config @@ -107,6 +108,8 @@ def __init__( 
activation=config.hidden_act, quant_config=quant_config, use_rms_norm=config.mamba_rms_norm, + prefix=f"{prefix}.mixer", + chunk_size=config.mamba_chunk_size, ) # n_groups is overridden later by `MambaMixer2` self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state @@ -316,6 +319,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() + # Instantiate the attention branch self.self_attn = FalconH1AttentionDecoderLayer( config=config, @@ -323,11 +327,18 @@ def __init__( quant_config=quant_config, prefix=prefix, ) + + # In V1 all attention/ssm layers must have + # different index in prefix + ssm_layer_idx = config.num_hidden_layers + layer_idx + ssm_prefix = prefix.split(".")[0] + f".{ssm_layer_idx}" + # Instantiate the SSM branch self.mamba = FalconH1SSMDecoderLayer( config=config, cache_config=cache_config, quant_config=quant_config, + prefix=ssm_prefix, ) self.ssm_out_multiplier = config.ssm_out_multiplier self.ssm_in_multiplier = config.ssm_in_multiplier @@ -452,10 +463,16 @@ def forward( # proper continuous batching computation including # chunked prefill attn_metadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) + + if not envs.VLLM_USE_V1: + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None + if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds * self.embedding_multiplier @@ -468,7 +485,9 @@ def forward( for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i) + layer_mamba_cache_params = None + if mamba_cache_params: + layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i) hidden_states = layer( positions=positions, hidden_states=hidden_states, @@ -484,7 +503,7 @@ def forward( class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - IsHybrid, SupportsV0Only): + IsHybrid): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -558,15 +577,19 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ): - if self.mamba_cache is None: - self.mamba_cache = MambaCacheManager( - self.vllm_config, - self.lm_head.weight.dtype - if hasattr(self.lm_head, 'weight') else torch.bfloat16, - self.config.num_hidden_layers, - *self._get_mamba_cache_shape(), - ) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + self.mamba_cache = MambaCacheManager( + self.vllm_config, + self.lm_head.weight.dtype if hasattr( + self.lm_head, 'weight') else torch.bfloat16, + self.config.num_hidden_layers, + *self._get_mamba_cache_shape(), + ) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = self.model( input_ids, positions, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 303cbdb25945..a3908e30ec6e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/Glm4v/modeling_Glm4v.py @@ -55,9 +56,6 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.gptq import GPTQConfig -from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -179,6 +177,7 @@ def __init__( hidden_features: int, bias: bool = False, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -186,13 +185,12 @@ def __init__( output_sizes=[hidden_features] * 2, bias=bias, quant_config=quant_config, - ) - self.down_proj = RowParallelLinear( - hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - ) + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor): @@ -407,6 +405,7 @@ def __init__( mlp_hidden_dim, bias=False, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) def forward( @@ -1278,7 +1277,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.visual = Glm4vVisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-5), - quant_config=self._maybe_ignore_quant_config(quant_config), + quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), ) @@ -1291,13 +1290,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): - # GPTQ configs do not have a list of ignored modules, however AutoGPTQ - # seems to avoid vision encoder sections for some models. 
- if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): - return None - return quant_config - def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 33e8626209d5..676ef24fc4da 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -9,6 +9,7 @@ from torch import nn from transformers import GraniteMoeHybridConfig +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -36,7 +37,7 @@ from .granitemoe import GraniteMoeMoE from .granitemoeshared import GraniteMoeSharedMLP from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsQuant, SupportsV0Only) + SupportsQuant) from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -67,7 +68,9 @@ def __init__(self, head_dim=config.mamba_d_head, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.mixer", + chunk_size=config.mamba_chunk_size) self.block_sparse_moe = None if getattr(config, "num_local_experts", 0) > 0: @@ -361,10 +364,15 @@ def forward( ) -> torch.Tensor: attn_metadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) + + if not envs.VLLM_USE_V1: + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None if get_pp_group().is_first_rank: if inputs_embeds is not None: @@ -386,7 +394,9 @@ def forward( num_attn += 1 layer_mamba_cache_params = None - if isinstance(layer, GraniteMoeHybridMambaDecoderLayer): + if isinstance( + layer, + GraniteMoeHybridMambaDecoderLayer) and mamba_cache_params: layer_mamba_cache_params = mamba_cache_params.at_layer_idx( i - num_attn) @@ -501,8 +511,7 @@ def _load_expert(n, p, name, shard_id, expert_id): class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, - SupportsPP, IsHybrid, SupportsV0Only, - SupportsQuant): + SupportsPP, IsHybrid, SupportsQuant): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -571,14 +580,20 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.model_config.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_mamba_layers = ( + self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, + LayerBlockType.mamba)) + self.mamba_cache = MambaCacheManager( + self.vllm_config, self.model_config.dtype, + num_mamba_layers, *self._get_mamba_cache_shape()) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = 
self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 3424efa80d48..5d51b01df9db 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -23,6 +23,7 @@ import torch from torch import nn +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -44,8 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsQuant, - SupportsV0Only) + SupportsQuant) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.models.utils import ( @@ -153,6 +153,8 @@ def __init__( rms_norm_eps=config.rms_norm_eps, activation=config.mamba_hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mixer", + chunk_size=config.chunk_size, ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -348,10 +350,14 @@ def forward( attn_metadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.chunk_size, - attn_metadata=attn_metadata, - ) + if not envs.VLLM_USE_V1: + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None if get_pp_group().is_first_rank: if inputs_embeds is not None: @@ -369,7 +375,8 @@ def forward( for i in range(len(self.layers)): layer = self.layers[i] layer_mamba_cache_params = None - if isinstance(layer, NemotronHMambaDecoderLayer): + if isinstance(layer, + NemotronHMambaDecoderLayer) and mamba_cache_params: layer_mamba_cache_params = mamba_cache_params.at_layer_idx( i - num_non_mamba_layers) else: @@ -437,7 +444,7 @@ def load_weights(self, weights: Iterable[tuple[str, class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - IsHybrid, SupportsV0Only, SupportsQuant): + IsHybrid, SupportsQuant): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -499,15 +506,23 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + + num_mamba_layers = \ + self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, + LayerBlockType.mamba + ) + + self.mamba_cache = MambaCacheManager( + self.vllm_config, self.lm_head.weight.dtype, + num_mamba_layers, *self._get_mamba_cache_shape()) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 1224ba7abc75..de99a76f2897 100644 --- a/vllm/model_executor/models/qwen3.py 
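Bamba, FalconH1, GraniteMoeHybrid and NemotronH above (and Zamba2 further below) all adopt the same guard: the Mamba cache manager and prepare_mamba2_metadata are only used on the V0 engine, while V1 reads the metadata from the forward context. A distilled version of that pattern, with prepare_metadata standing in for prepare_mamba2_metadata:

from typing import Any, Callable, Optional

def get_mamba2_metadata(use_v1: bool, chunk_size: int, attn_metadata: Any,
                        prepare_metadata: Callable[..., Any]) -> Optional[Any]:
    if not use_v1:
        # V0: build the chunked-prefill metadata explicitly per forward pass.
        return prepare_metadata(chunk_size=chunk_size,
                                attn_metadata=attn_metadata)
    # V1: the mixer obtains its metadata from the forward context instead.
    return None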
+++ b/vllm/model_executor/models/qwen3.py @@ -38,15 +38,14 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors -from .interfaces import SupportsCrossEncoding, SupportsLoRA, SupportsPP +from .adapters import as_seq_cls_model +from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix @@ -323,114 +322,4 @@ def load_weights(self, weights: Iterable[tuple[str, return loader.load_weights(weights) -class Qwen3ForSequenceClassification(nn.Module, SupportsLoRA, - SupportsCrossEncoding): - - def __init__( - self, - vllm_config: "VllmConfig", - prefix: str = "", - ) -> None: - super().__init__() - - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config - - self.vllm_config = vllm_config - self.config = config - self.quant_config = quant_config - self.prefix = prefix - self.model = Qwen3Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.score = RowParallelLinear(config.hidden_size, - config.num_labels, - quant_config=quant_config, - input_is_parallel=False, - bias=False, - prefix=maybe_prefix(prefix, "score")) - - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - return self.model(input_ids=input_ids, - positions=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - hidden_states = self._pooler.extract_states(hidden_states, - pooling_metadata) - - if isinstance(hidden_states, list): - logits = [self.score(state)[0] for state in hidden_states] - else: - logits, _ = self.score(hidden_states) - - pooled_data = self._pooler.head(logits, pooling_metadata) - pooled_outputs = [ - self._pooler.build_output(data.squeeze(-1)) for data in pooled_data - ] - return PoolerOutput(outputs=pooled_outputs) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - is_original_qwen3_reranker = getattr(self.config, - "is_original_qwen3_reranker", - False) - - if not is_original_qwen3_reranker: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - return self.load_weights_from_original_qwen3_reranker(weights) - - def load_weights_from_original_qwen3_reranker( - self, weights: Iterable[tuple[str, torch.Tensor]]): - - model_config = self.vllm_config.model_config - tokens = getattr(self.config, "classifier_from_token", None) - device = self.score.weight.device - - if 
self.config.tie_word_embeddings: - self.lm_head = self.model.embed_tokens - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=self.quant_config, - prefix=maybe_prefix( - self.prefix, "lm_head")) - - loader = AutoWeightsLoader(self) - loaded_weights = loader.load_weights(weights) - - from vllm.transformers_utils.tokenizer import get_tokenizer - tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code) - - a = tokenizer.convert_tokens_to_ids(tokens[0]) - b = tokenizer.convert_tokens_to_ids(tokens[1]) - weight = self.lm_head.weight.data[b].to( - device) - self.lm_head.weight.data[a].to(device) - self.score.weight.data.copy_(weight) - - del self.lm_head - loaded_weights.add("score.weight") - loaded_weights.discard("lm_head.weight") - return loaded_weights +Qwen3ForSequenceClassification = as_seq_cls_model(Qwen3ForCausalLM) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 90a28192eccb..ff182aadf738 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -135,7 +135,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - final_hidden_states = final_hidden_states + if self.tp_size > 1: final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 final_hidden_states) diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index c2d3d0114c23..25f026e9bef8 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 27b3c75513fb..344d6fc8f452 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -780,8 +780,9 @@ def validate_language(cls, language: str) -> bool: @classmethod def get_decoder_prompt(cls, language: str, task_type: str, prompt: str) -> str: - return (f"<|startoftranscript|><|{language}|><|{task_type}|>" - f"<|notimestamps|>{prompt}") + return ((f"<|prev|>{prompt}" if prompt else "") + + f"<|startoftranscript|><|{language}|>" + + f"<|{task_type}|><|notimestamps|>") @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index a4f97c774f70..54c80cfa5922 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -15,6 +15,7 @@ from torch import nn from transformers import Zamba2Config +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -41,7 +42,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import HasInnerState, IsHybrid, SupportsV0Only +from .interfaces import HasInnerState, IsHybrid from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -58,6 
+59,7 @@ def __init__( rank: int, output_dim: Union[int, list[int]], quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): """Initialize the attention layer. @@ -283,6 +285,7 @@ def __init__( bare_block_idx: int, num_hybrid_layers: dict[int, int], quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: """Initialize the MLP layer. @@ -471,11 +474,10 @@ class Zamba2MambaDecoderLayer(nn.Module): computation depending on configuration. """ - def __init__( - self, - config: Zamba2Config, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, + config: Zamba2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: """Initialize the Mamba decoder layer. Args: @@ -486,20 +488,21 @@ def __init__( # Initialize Mamba mixer with expanded intermediate size intermediate_size = config.mamba_expand * config.hidden_size - self.mamba = MambaMixer2( - hidden_size=config.hidden_size, - ssm_state_size=config.mamba_d_state, - conv_kernel_size=config.mamba_d_conv, - intermediate_size=intermediate_size, - use_conv_bias=config.use_conv_bias, - use_bias=config.add_bias_linear, - n_groups=config.mamba_ngroups, - num_heads=config.n_mamba_heads, - head_dim=intermediate_size // config.n_mamba_heads, - rms_norm_eps=config.rms_norm_eps, - activation="silu", - quant_config=quant_config, - ) + self.mamba = MambaMixer2(hidden_size=config.hidden_size, + ssm_state_size=config.mamba_d_state, + conv_kernel_size=config.mamba_d_conv, + intermediate_size=intermediate_size, + use_conv_bias=config.use_conv_bias, + use_bias=config.add_bias_linear, + n_groups=config.mamba_ngroups, + num_heads=config.n_mamba_heads, + head_dim=intermediate_size // + config.n_mamba_heads, + rms_norm_eps=config.rms_norm_eps, + activation="silu", + quant_config=quant_config, + prefix=f"{prefix}.mixer", + chunk_size=config.chunk_size) # Input normalization self.input_layernorm = RMSNorm(config.hidden_size, @@ -573,6 +576,7 @@ def __init__( config: Zamba2Config, block_idx: int, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: """Initialize the hybrid layer. 
@@ -589,7 +593,8 @@ def __init__( bias=False, quant_config=quant_config) self.mamba_decoder = Zamba2MambaDecoderLayer(config, - quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) def forward( self, @@ -699,14 +704,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # Initialize layers according to block type configuration layers = [] for layer_idx, layer_type in enumerate(config.layers_block_type): + # tdoublep: avoid layers getting same index + # somewhat hacky but correct (I think) + prefix = str(len(layer2block_map) + layer_idx) if layer_type == "hybrid": block = next(blocks) block_idx = layer2block_map[layer_idx] layers.append( - Zamba2HybridLayer(block, config, block_idx, quant_config)) + Zamba2HybridLayer(block, + config, + block_idx, + quant_config, + prefix=prefix)) else: layers.append( - Zamba2MambaDecoderLayer(config, quant_config=quant_config)) + Zamba2MambaDecoderLayer(config, + quant_config=quant_config, + prefix=prefix)) self.layers = nn.ModuleList(layers) # Final layer normalization @@ -751,19 +765,30 @@ def forward( attn_metadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.chunk_size, - attn_metadata=attn_metadata, - ) + if not envs.VLLM_USE_V1: + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None # Process through layers original_hidden_states = torch.clone(hidden_states) for layer_idx, layer in enumerate(self.layers): + + layer_mamba_cache_params = None + if (isinstance(layer, (Zamba2HybridLayer, Zamba2MambaDecoderLayer)) + and mamba_cache_params): + layer_mamba_cache_params = mamba_cache_params.at_layer_idx( + layer_idx) + layer_outputs = layer( hidden_states, original_hidden_states=original_hidden_states, positions=positions, - mamba_cache_params=mamba_cache_params.at_layer_idx(layer_idx), + mamba_cache_params=layer_mamba_cache_params, mamba2_metadata=mamba2_metadata, ) hidden_states = layer_outputs @@ -803,7 +828,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): +class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): """Zamba2 model with causal language modeling head. 
This class wraps the core Zamba2 model and adds: @@ -897,14 +922,16 @@ def forward(self, Output hidden states """ # Initialize Mamba cache if needed - if self.mamba_cache is None: - num_mamba_layers = self.config.num_hidden_layers - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - - # Get cache parameters for current run - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_mamba_layers = self.config.num_hidden_layers + self.mamba_cache = MambaCacheManager( + self.vllm_config, self.lm_head.weight.dtype, + num_mamba_layers, *self._get_mamba_cache_shape()) + + # Get cache parameters for current run + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) # Forward pass through model hidden_states = self.model( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 22e696141b84..8dfbc6503520 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -43,6 +43,16 @@ def __init__( *, allowed_local_media_path: str = "", ) -> None: + """ + Args: + media_io_kwargs: Additional args passed to process media + inputs, keyed by modalities. For example, + to set num_frames for video, set + `--media-io-kwargs '{"video":{"num_frames":40}}'` + connection: HTTP connection client to download media contents. + allowed_local_media_path: A local directory to load media files + from. + """ super().__init__() self.media_io_kwargs: dict[str, dict[ @@ -277,15 +287,6 @@ def fetch_image_embedding( return image_embedding_io.load_base64("", data) -global_media_connector = MediaConnector() -"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector] -instance used by vLLM.""" - -fetch_audio = global_media_connector.fetch_audio -fetch_image = global_media_connector.fetch_image -fetch_video = global_media_connector.fetch_video - - def encode_audio_base64( audio: np.ndarray, sampling_rate: float, @@ -441,3 +442,51 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor, dim=0) vision_embeddings = vision_embeddings[:num_chunks, ...] return vision_embeddings + + +def fetch_audio( + audio_url: str, + audio_io_kwargs: Optional[dict[str, Any]] = None, +) -> tuple[np.ndarray, Union[int, float]]: + """ + Args: + audio_url: URL of the audio file to fetch. + audio_io_kwargs: Additional kwargs passed to handle audio IO. + """ + media_io_kwargs = None if not audio_io_kwargs else { + "audio": audio_io_kwargs + } + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_audio(audio_url) + + +def fetch_image( + image_url: str, + image_io_kwargs: Optional[dict[str, Any]] = None, +) -> Image.Image: + """ + Args: + image_url: URL of the image file to fetch. + image_io_kwargs: Additional kwargs passed to handle image IO. + """ + media_io_kwargs = None if not image_io_kwargs else { + "image": image_io_kwargs + } + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_image(image_url) + + +def fetch_video( + video_url: str, + video_io_kwargs: Optional[dict[str, Any]] = None, +) -> tuple[npt.NDArray, dict[str, Any]]: + """ + Args: + video_url: URL of the video file to fetch. + video_io_kwargs: Additional kwargs passed to handle video IO. 
+ """ + media_io_kwargs = None if not video_io_kwargs else { + "video": video_io_kwargs + } + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_video(video_url) \ No newline at end of file diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 15cab757d2c0..f82c1e569977 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -251,6 +251,10 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype, # Default backends for V1 engine # Prefer FlashInfer for Blackwell GPUs if installed + if dtype not in (torch.float16, torch.bfloat16): + logger.info_once( + f"Using FlexAttenion backend for {dtype} on V1 engine.") + return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 if cls.is_device_capability(100): try: import flashinfer # noqa: F401 diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0f08bf986333..567d5cbf503f 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -4,6 +4,7 @@ import os import platform import random +import sys from datetime import timedelta from platform import uname from typing import TYPE_CHECKING, NamedTuple, Optional, Union @@ -164,6 +165,9 @@ def is_neuron(self) -> bool: def is_out_of_tree(self) -> bool: return self._enum == PlatformEnum.OOT + def get_max_output_tokens(self, prompt_len: int) -> int: + return sys.maxsize + def is_cuda_alike(self) -> bool: """Stateless version of [torch.cuda.is_available][].""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 701bb38810f6..5c422a9e3fce 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -33,10 +33,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, DbrxConfig, DeepseekVLV2Config, EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, - InternVLChatConfig, JAISConfig, - KimiVLConfig, MedusaConfig, - MiniMaxText01Config, + JAISConfig, KimiVLConfig, + MedusaConfig, MiniMaxText01Config, MiniMaxVL01Config, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, @@ -90,8 +88,6 @@ def _get_hf_token() -> Optional[str]: "medusa": MedusaConfig, "eagle": EAGLEConfig, "exaone": ExaoneConfig, - "h2ovl_chat": H2OVLChatConfig, - "internvl_chat": InternVLChatConfig, "minimax_text_01": MiniMaxText01Config, "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, @@ -104,6 +100,10 @@ def _get_hf_token() -> Optional[str]: **_CONFIG_REGISTRY_OVERRIDE_HF } +_CONFIG_ATTRS_MAPPING: dict[str, str] = { + "llm_config": "text_config", +} + class ConfigFormat(str, enum.Enum): AUTO = "auto" @@ -286,6 +286,18 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool: return getattr(config, "is_encoder_decoder", False) +def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: + """Remap config attributes to match the expected names.""" + for old_attr, new_attr in _CONFIG_ATTRS_MAPPING.items(): + if hasattr(config, old_attr): + if not hasattr(config, new_attr): + config.update({new_attr: getattr(config, old_attr)}) + delattr(config, old_attr) + logger.debug("Remapped config attribute '%s' to '%s'", old_attr, + new_attr) + return config + + def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -361,6 +373,9 @@ def get_config( revision=revision, code_revision=code_revision, token=_get_hf_token(), + # some old custom model's config needs + # `has_no_defaults_at_init=True` to 
work. + has_no_defaults_at_init=trust_remote_code, **kwargs, ) except ValueError as e: @@ -376,6 +391,7 @@ def get_config( raise RuntimeError(err_msg) from e else: raise e + config = _maybe_remap_hf_config_attrs(config) elif config_format == ConfigFormat.MISTRAL: config = load_params_config(model, revision, **kwargs) @@ -639,34 +655,35 @@ class module does not need to be importable on the receiving end. """ # noqa try: import transformers_modules + transformers_modules_available = True except ImportError: - # the config does not need trust_remote_code - return + transformers_modules_available = False try: - import cloudpickle - cloudpickle.register_pickle_by_value(transformers_modules) - - # ray vendors its own version of cloudpickle - from vllm.executor.ray_utils import ray - if ray: - ray.cloudpickle.register_pickle_by_value(transformers_modules) - - # multiprocessing uses pickle to serialize arguments when using spawn - # Here we get pickle to use cloudpickle to serialize config objects - # that contain instances of the custom config class to avoid - # serialization problems if the generated module (and model) has a `.` - # in its name import multiprocessing import pickle + import cloudpickle + from vllm.config import VllmConfig + # Register multiprocessing reducers to handle cross-process + # serialization of VllmConfig objects that may contain custom configs + # from transformers_modules def _reduce_config(config: VllmConfig): return (pickle.loads, (cloudpickle.dumps(config), )) multiprocessing.reducer.register(VllmConfig, _reduce_config) + # Register transformers_modules with cloudpickle if available + if transformers_modules_available: + cloudpickle.register_pickle_by_value(transformers_modules) + + # ray vendors its own version of cloudpickle + from vllm.executor.ray_utils import ray + if ray: + ray.cloudpickle.register_pickle_by_value(transformers_modules) + except Exception as e: logger.warning( "Unable to register remote classes used by" diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 97a1b683a9b8..734f1e09d0fd 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -11,8 +11,6 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
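A minimal sketch of how the _maybe_remap_hf_config_attrs helper introduced in the config.py hunk above behaves (editor's illustration, not part of the patch; the config values and the import of the private helper are assumptions):

from transformers import PretrainedConfig
from vllm.transformers_utils.config import _maybe_remap_hf_config_attrs

cfg = PretrainedConfig()
cfg.llm_config = {"hidden_size": 1024}   # legacy attribute name used by some remote configs
cfg = _maybe_remap_hf_config_attrs(cfg)
# Per _CONFIG_ATTRS_MAPPING, the same value should now be reachable as
# cfg.text_config, and the old llm_config attribute is removed.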
from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig -from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig @@ -38,8 +36,6 @@ "DeepseekVLV2Config", "MPTConfig", "RWConfig", - "H2OVLChatConfig", - "InternVLChatConfig", "JAISConfig", "MedusaConfig", "EAGLEConfig", diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py deleted file mode 100644 index b36a6dd59d3d..000000000000 --- a/vllm/transformers_utils/configs/h2ovl.py +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py -# -------------------------------------------------------- -# H2OVL-Mississippi -# Copyright (c) 2024 H2O.AI -# Licensed under Apache 2.0 License [see LICENSE for details] -# -------------------------------------------------------- - -from .internvl import InternVLChatConfig - - -class H2OVLChatConfig(InternVLChatConfig): - model_type = "h2ovl_chat" diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py deleted file mode 100644 index 4494ebfef667..000000000000 --- a/vllm/transformers_utils/configs/internvl.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2024 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -from transformers.configuration_utils import PretrainedConfig - - -class InternVLChatConfig(PretrainedConfig): - model_type = 'internvl_chat' - is_composition = True - - def __init__(self, - vision_config=None, - llm_config=None, - use_backbone_lora=0, - use_llm_lora=0, - select_layer=-1, - force_image_size=None, - downsample_ratio=0.5, - template=None, - dynamic_image_size=False, - use_thumbnail=False, - ps_version='v1', - min_dynamic_patch=1, - max_dynamic_patch=6, - **kwargs): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = PretrainedConfig(**llm_config) - - self.use_backbone_lora = use_backbone_lora - self.use_llm_lora = use_llm_lora - self.select_layer = select_layer - self.force_image_size = force_image_size - self.downsample_ratio = downsample_ratio - self.template = template - self.dynamic_image_size = dynamic_image_size - self.use_thumbnail = use_thumbnail - self.ps_version = ps_version # pixel shuffle version - self.min_dynamic_patch = min_dynamic_patch - self.max_dynamic_patch = max_dynamic_patch diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py index 9fe75f2dfeea..457b3371e90d 100644 --- a/vllm/transformers_utils/configs/nemotron_h.py +++ b/vllm/transformers_utils/configs/nemotron_h.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py index a533720af6c6..edfc506882ff 100644 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -8,8 +8,24 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from .internvl import InternVLChatConfig +from transformers import Qwen2Config +from transformers.configuration_utils import PretrainedConfig -class NVLM_D_Config(InternVLChatConfig): +class NVLM_D_Config(PretrainedConfig): model_type = 'NVLM_D' + is_composition = True + + def __init__(self, vision_config=None, llm_config=None, **kwargs): + super().__init__(**kwargs) + + # Handle vision_config initialization + if vision_config is None: + vision_config = {} + + # Handle llm_config initialization + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = Qwen2Config(**llm_config) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index c149637635b7..92245498de65 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -20,9 +20,12 @@ import vllm.envs as envs from vllm.connections import global_http_connection +from vllm.logger import init_logger from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties from vllm.version import __version__ as VLLM_VERSION +logger = init_logger(__name__) + _config_home = envs.VLLM_CONFIG_ROOT _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json") _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track") @@ -183,7 +186,7 @@ def _report_usage_once(self, model_architecture: str, self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) except Exception: - pass + logger.exception("Failed to collect TPU information") self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d97d873ccfdd..9550b056fbba 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -46,7 +46,7 @@ from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, TypeVar, Union, cast, overload) + Optional, Tuple, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -628,14 +628,34 @@ def is_valid_ipv6_address(address: str) -> bool: return False +def split_host_port(host_port: str) -> Tuple[str, int]: + # ipv6 + if host_port.startswith('['): + host, port = host_port.rsplit(']', 1) + host = host[1:] + port = port.split(':')[1] + return host, int(port) + else: + host, port = host_port.split(':') + return host, int(port) + + +def join_host_port(host: str, port: int) -> str: + if is_valid_ipv6_address(host): + return f"[{host}]:{port}" + else: + return f"{host}:{port}" + + def get_distributed_init_method(ip: str, port: int) -> str: return get_tcp_uri(ip, port) def get_tcp_uri(ip: str, port: int) -> str: - # Brackets are not permitted in ipv4 addresses, - # see https://github.com/python/cpython/issues/103848 - return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" + if 
is_valid_ipv6_address(ip): + return f"tcp://[{ip}]:{port}" + else: + return f"tcp://{ip}:{port}" def get_open_zmq_ipc_path() -> str: @@ -2779,7 +2799,7 @@ def find_unimplemented_methods(self: object): if unimplemented_methods: method_names = ','.join(unimplemented_methods) msg = (f"Methods {method_names} not implemented in {self}") - logger.warning(msg) + logger.debug(msg) @wraps(original_init) def wrapped_init(self, *args, **kwargs) -> None: @@ -2985,4 +3005,4 @@ def has_deep_ep() -> bool: def has_deep_gemm() -> bool: """Whether the optional `deep_gemm` package is available.""" - return _has_module("deep_gemm") \ No newline at end of file + return _has_module("deep_gemm") diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 72b2d09fa1f1..e493f1b8088b 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -from dataclasses import dataclass -from typing import Any, Optional - +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import torch -from torch.nn.functional import scaled_dot_product_attention -from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl, + TorchSDPAMetadata) from vllm.attention.backends.utils import CommonAttentionState -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap +from vllm.attention.ops.ipex_attn import PagedAttention from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.core.sched.output import SchedulerOutput @@ -20,17 +16,6 @@ from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_input_batch import InputBatch -try: - import intel_extension_for_pytorch.llm.modules as ipex_modules - _use_ipex = True -# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 -except (ImportError, AttributeError): - _use_ipex = False - -from vllm import _custom_ops as ops - -logger = init_logger(__name__) - class TorchSDPABackend: accept_output_buffer: bool = False @@ -62,265 +47,14 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> tuple[int, ...]: - return _get_paged_attn_impl().get_kv_cache_shape( # type: ignore - num_blocks, block_size, num_kv_heads, head_size) + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: return False -@dataclass -class TorchSDPAMetadata(AttentionMetadata): - """Attention metadata for prefill and decode batched together.""" - # Total number of prefill requests. - num_prefills: int - # Number of prefill tokens. - num_prefill_tokens: int - # Number of decode tokens. Note that it is equivalent to the number of - # decode requests. - num_decode_tokens: int - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - - # The index maps that relate multi-modal embeddings to the corresponding - # placeholders. - # - # N.B. 
These aren't really related to attention and don't belong on this - # type -- this is just a temporary solution to make them available to - # `model_executable`. - multi_modal_placeholder_index_maps: Optional[dict[ - str, MultiModalPlaceholderMap.IndexMap]] - - # Enable/disable KV scales calculation. This is so that we can disable the - # calculation until after prefill and cuda graph capture. - enable_kv_scales_calculation: bool - """Metadata for PagedAttention.""" - # (batch_size,). The length of sequences (entire tokens seen so far) per - # sequence. - seq_lens_tensor: Optional[torch.Tensor] - # Maximum sequence length in the batch. 0 if it is prefill-only batch. - max_decode_seq_len: int - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - """Metadata for TorchSDPABackend. - """ - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. - chunked_prefill: bool - seq_lens: Optional[list[int]] = None # For non-chunked prefill - - # For chunked prefill only - max_query_len: Optional[int] = None - max_kv_len: Optional[int] = None - prefill_query_start_loc: Optional[torch.Tensor] = None - kv_start_loc: Optional[torch.Tensor] = None - prefill_block_tables: Optional[torch.Tensor] = None - - # For V1 logits index only - query_start_loc: Optional[torch.Tensor] = None - - # Begin encoder attn & enc/dec cross-attn fields... - # Encoder sequence lengths representation - encoder_seq_lens: Optional[list[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[list[torch.Tensor]] = None - self.encoder_attn_bias: Optional[list[torch.Tensor]] = None - self.cross_attn_bias: Optional[list[torch.Tensor]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return ((self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None)) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. 
- ''' - return (self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None)) - - @property - def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_prefill_tokens == 0: - return None - return self - - @property - def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: - if self.num_decode_tokens == 0: - return None - return self - - def get_seq_lens( - self, - attn_type: str, - ): - ''' - Extract appropriate sequence lengths from attention metadata - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate sequence lengths tensor for query - * Appropriate sequence lengths tensor for key & value - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - seq_lens_q = self.seq_lens - seq_lens_kv = self.seq_lens - elif attn_type == AttentionType.ENCODER: - seq_lens_q = self.encoder_seq_lens - seq_lens_kv = self.encoder_seq_lens - elif attn_type == AttentionType.ENCODER_DECODER: - seq_lens_q = self.seq_lens - seq_lens_kv = self.encoder_seq_lens - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - return seq_lens_q, seq_lens_kv - - def get_attn_bias( - self, - attn_type: str, - ) -> Optional[list[torch.Tensor]]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return self.attn_bias - elif attn_type == AttentionType.ENCODER: - return self.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return self.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def set_attn_bias( - self, - attn_bias: list[torch.Tensor], - attn_type: str, - ) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - self.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - self.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - self.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - def get_seq_len_block_table_args( - self, - attn_type: str, - ) -> tuple: - ''' - The particular choice of sequence-length- and block-table-related - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. 
- - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & - cross-attn block-tables fields - Encoder attn -> select encoder sequence lengths fields & no block tables - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * is_prompt: True if prefill, False otherwise - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - - * Appropriate sequence-lengths tensor - * Appropriate max sequence-length scalar - * Appropriate block tables (or None) - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - return (self.seq_lens_tensor, self.max_decode_seq_len, - self.block_tables) - elif attn_type == AttentionType.ENCODER_DECODER: - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - self.cross_block_tables) - elif attn_type == AttentionType.ENCODER: - # No block tables associated with encoder attention - return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, - None) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec, @@ -432,503 +166,3 @@ def build(self, common_prefix_len: int, ) return attn_metadata - - -class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[list[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if blocksparse_params is not None: - raise ValueError( - "Torch SPDA does not support block-sparse attention.") - if logits_soft_cap is not None: - logger.warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in Torch SPDA is not supported yet, it will fall" - " back to global attention for long context.") - self.paged_attn_impl = _get_paged_attn_impl() - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.alibi_slopes is not None - or self.sliding_window is not None) - - supported_head_sizes = self.paged_attn_impl \ - .get_supported_head_sizes() # type: ignore - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. 
" - f"Supported head sizes are: {supported_head_sizes}.") - - if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: - raise NotImplementedError( - "Torch SDPA backend FP8 KV cache requires " - "intel_extension_for_pytorch support.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: TorchSDPAMetadata, # type: ignore - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with torch SDPA and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for TorchSDPABackendImpl") - - # For warming-up - if attn_metadata is None: - return query - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. for later use by paged attention - key_cache, value_cache = self.paged_attn_impl \ - .split_kv_cache( # type: ignore - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - self.paged_attn_impl.write_to_paged_cache( # type: ignore - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - - if attn_type != AttentionType.ENCODER: - # Decoder self-attention supports chunked prefill. 
- # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - else: - # Encoder attention - chunked prefill is not applicable; - # derive token-count from query shape & and treat them - # as 100% prefill tokens - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - num_decode_tokens = 0 - - if attn_type == AttentionType.DECODER: - # Only enforce this shape-constraint for decoder - # self-attention - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - - output = torch.empty_like(query) - if prefill_meta := attn_metadata.prefill_metadata: - if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore - assert attn_metadata.seq_lens is not None - self._run_sdpa_forward(output, - query, - key, - value, - prefill_meta, - attn_type=attn_type) - else: - # prefix-enabled attention - assert not self.need_mask - import intel_extension_for_pytorch.llm.modules as ipex_modules - output = torch.empty_like(query) - ipex_modules.PagedAttention.flash_attn_varlen_func( - output[:prefill_meta.num_prefill_tokens, :, :], - query[:prefill_meta.num_prefill_tokens, :, :], - key_cache, - value_cache, - prefill_meta.prefill_query_start_loc, - prefill_meta.kv_start_loc, - prefill_meta.max_query_len, - prefill_meta.max_kv_len, - self.scale, - True, - prefill_meta.prefill_block_tables, - self.alibi_slopes, - ) - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - # Decoding run. - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = decode_meta.get_seq_len_block_table_args(attn_type) - - self.paged_attn_impl.forward_decode( # type: ignore - output[attn_metadata.num_prefill_tokens:, :, :], - query[attn_metadata.num_prefill_tokens:, :, :], - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. 
- return output.view(-1, self.num_heads * self.head_size) - - def _run_sdpa_forward( - self, - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: TorchSDPAMetadata, - attn_type: str = AttentionType.DECODER, - ) -> None: - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, dim=1) - - attn_masks = attn_metadata.get_attn_bias(attn_type) - if attn_masks is None: - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: - assert attn_metadata.seq_lens is not None - attn_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - seq_lens, _ = attn_metadata.get_seq_lens(attn_type) - attn_masks = [None] * len(seq_lens) - attn_metadata.set_attn_bias(attn_masks, attn_type) - - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - causal_attn = (attn_type == AttentionType.DECODER) - - seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) - start_q, start_kv = 0, 0 - for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, - attn_masks): - end_q = start_q + seq_len_q - end_kv = start_kv + seq_len_kv - sub_out = scaled_dot_product_attention( - query[None, :, start_q:end_q, :], - key[None, :, start_kv:end_kv, :], - value[None, :, start_kv:end_kv, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=causal_attn and mask is None, - scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) - output[start_q:end_q, :, :] = sub_out - start_q, start_kv = end_q, end_kv - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: list[int], -) -> list[torch.Tensor]: - attn_biases: list[torch.Tensor] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. 
- bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) - attn_biases.append((bias + inf_mask).to(dtype)) - - return attn_biases - - -def _make_sliding_window_bias( - seq_lens: list[int], - window_size: Optional[int], - dtype: torch.dtype, -) -> list[torch.Tensor]: - attn_biases: list[torch.Tensor] = [] - for seq_len in seq_lens: - tensor = torch.full( - (1, seq_len, seq_len), - dtype=dtype, - fill_value=1, - ) - shift = 0 - mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore - if window_size is not None: - mask = torch.triu(mask, diagonal=shift - window_size + 1) - mask = torch.log(mask) - attn_biases.append(mask.to(dtype)) - - return attn_biases - - -class _PagedAttention: - - @staticmethod - def get_supported_head_sizes() -> list[int]: - return [32, 64, 80, 96, 112, 128, 192, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - *args, - ) -> tuple[int, ...]: - return 2, num_blocks, block_size * num_kv_heads * head_size - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - *args, - ) -> tuple[torch.Tensor, torch.Tensor]: - x = 16 // kv_cache.element_size() - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, - -1, x) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping.flatten(), - kv_cache_dtype, - k_scale, - v_scale, - ) - - @staticmethod - def forward_decode( - output: torch.Tensor, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - tp_rank: int = 0 - blocksparse_local_blocks: int = 0 - blocksparse_vert_stride: int = 0 - blocksparse_block_size: int = 64 - blocksparse_head_sliding_step: int = 0 - block_size = value_cache.shape[3] - - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - tp_rank, - blocksparse_local_blocks, - blocksparse_vert_stride, - blocksparse_block_size, - blocksparse_head_sliding_step, - ) - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - *args, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -class _IPEXPagedAttention(_PagedAttention): - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - *args, - ) -> 
tuple[torch.Tensor, torch.Tensor]: - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - ipex_modules.PagedAttention.reshape_and_cache( - key, value, key_cache, value_cache, - slot_mapping.flatten().int()) - - @staticmethod - def forward_decode( - output: torch.Tensor, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: torch.Tensor, - v_scale: torch.Tensor, - *args, - ) -> None: - block_size = value_cache.shape[2] - head_mapping = torch.arange( - 0, - num_kv_heads, - device="cpu", - dtype=torch.int32, - ).view(num_kv_heads, - 1).repeat_interleave(query.size(1) // num_kv_heads).flatten() - ipex_modules.PagedAttention.single_query_cached_kv_attention( - output, query.contiguous(), key_cache, value_cache, head_mapping, - scale, block_tables, context_lens, block_size, max_context_len, - alibi_slopes) - - -def _get_paged_attn_impl() -> type: - if _use_ipex: - return _IPEXPagedAttention - else: - return _PagedAttention diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index dd8d7994ed33..ebd5914ee40a 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass @@ -462,6 +463,13 @@ def forward( query = query[:, :, :num_actual_tokens, :] # Doesn't work for now -> constraint violation # torch._dynamo.try_mark_dynamic(query, 2) + + # default M=64, N=64 may run out of shared memory on + # some GPUs with fp32, so we use smaller M and N. 
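        # (Editor's illustration, not part of the original patch.) With the merge
        # in the call below, an fp32 query ends up passing
        #   kernel_options == {"FORCE_USE_FLEX_ATTENTION": True,
        #                      "BLOCK_M": 32, "BLOCK_N": 32}
        # to flex_attention_compiled, while fp16/bf16 queries keep only the
        # FORCE_USE_FLEX_ATTENTION flag.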
+ extra_kernel_options = { + "BLOCK_M": 32, + "BLOCK_N": 32 + } if query.dtype == torch.float32 else {} out = flex_attention_compiled( query, key_cache, @@ -470,7 +478,10 @@ def forward( attn_metadata.block_mask, self.scale, enable_gqa=enable_gqa, - kernel_options={"FORCE_USE_FLEX_ATTENTION": True}, + kernel_options={ + "FORCE_USE_FLEX_ATTENTION": True, + **extra_kernel_options + }, ) # Flex doesn't have an out variant today, rely on epilogue fusion diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index c8ec571989c6..db4b9c9537e5 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index dc8ff2261306..63537384a1da 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 8083f2002602..b0ebb00d9e6b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -138,7 +138,7 @@ def get_kv_cache_layout(): if cache_layout is None: cache_layout = get_kv_connector_cache_layout() else: - logger.info_once("`FLASHINFER_KV_CACHE_LAYOUT` environment variable " \ + logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. 
Setting KV cache layout to %s.", cache_layout) return cache_layout diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 5620d9bee7a3..38de00625e3f 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Callable, Optional @@ -26,6 +27,7 @@ def __init__( ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len + self.enable_caching = enable_caching self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events) @@ -267,9 +269,13 @@ def verify_and_split_kv_cache_groups(self) -> None: self.full_attention_block_size = self.full_attention_spec.block_size self.other_block_size = self.other_spec.block_size - assert self.other_block_size % self.full_attention_block_size == 0, ( - "KVCacheCoordinator assumes the block_size of full attention " - "layers is divisible by other layers now.") + + if self.enable_caching: + # this requirement is only needed for the prefix caching logic + divisible = self.other_block_size % self.full_attention_block_size + assert divisible == 0, ( + "KVCacheCoordinator assumes the block_size of full " + "attention layers is divisible by other layers now.") if max(self.full_attention_group_ids) < min(self.other_group_ids): self.full_attn_first = True diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 08bb0efb2f3d..6937455e7d85 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -84,12 +84,15 @@ def __init__( self.log_stats = log_stats # FIXME: make prefix cache stats conditional on log_stats self.prefix_cache_stats = PrefixCacheStats() if log_stats else None - assert len( - set(g.kv_cache_spec.block_size - for g in kv_cache_config.kv_cache_groups) - ) == 1, "Only one block size is supported for now" - self.block_size = kv_cache_config.kv_cache_groups[ - 0].kv_cache_spec.block_size + + self.block_size: Optional[int] = None + if self.enable_caching: + assert len( + set(g.kv_cache_spec.block_size + for g in kv_cache_config.kv_cache_groups) + ) == 1, "Only one block size is supported for now" + self.block_size = kv_cache_config.kv_cache_groups[ + 0].kv_cache_spec.block_size self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, @@ -154,6 +157,7 @@ def get_computed_blocks(self, # if the scheduler has tried to schedule the request before. block_hashes = self.req_to_block_hashes[request.request_id] if not block_hashes: + assert self.block_size is not None block_hashes = hash_request_tokens(self.caching_hash_fn, self.block_size, request) self.req_to_block_hashes[request.request_id] = block_hashes diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9489bcf433fd..2fbcb569e3d5 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -864,9 +864,11 @@ def _get_kv_cache_config_uniform_page_size( kv_cache_groups=kv_cache_groups, ) + min_block_size = min( + [group.kv_cache_spec.block_size for group in kv_cache_groups]) + # Print the KV cache size and maximum concurrency. 
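    # (Editor's worked example, not part of the original patch; the numbers are
    # invented.) With num_blocks=8192 shared by 2 groups of layers whose block
    # sizes are 16 and 32 tokens, min_block_size is 16 and the log below reports
    # num_tokens = 8192 // 2 * 16 = 65536 tokens.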
- num_tokens = num_blocks // len( - grouped_layers) * vllm_config.cache_config.block_size + num_tokens = num_blocks // len(grouped_layers) * min_block_size num_tokens_str = f"{num_tokens:,}" logger.info("GPU KV cache size: %s tokens", num_tokens_str) max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7e7703df2cf1..9fc52543efde 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -152,6 +152,11 @@ def _validate_structured_output(self, params: SamplingParams) -> None: if not params.guided_decoding or not self.decoding_config: return + if self.model_config.skip_tokenizer_init and params.guided_decoding: + raise ValueError( + "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 + ) + engine_level_backend = self.decoding_config.backend if params.guided_decoding.backend: # Request-level backend selection is not supported in V1. diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index c48775adc9b8..43456a987def 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -159,6 +159,7 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: class MambaSpec(KVCacheSpec): shapes: tuple[tuple[int, ...], ...] dtype: torch.dtype + page_size_padded: Optional[int] = None def __post_init__(self): self.num_elements = sum(prod(shape) for shape in self.shapes) @@ -169,7 +170,11 @@ def type_id(self) -> str: @property def page_size_bytes(self) -> int: - return self.num_elements * get_dtype_size(self.dtype) + page_size = self.num_elements * get_dtype_size(self.dtype) + if self.page_size_padded is not None: + assert self.page_size_padded >= page_size + return self.page_size_padded + return page_size def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: # We allocate 1 block for each request now, so max_memory_usage_bytes is diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py index d70a0d044661..5f321cd87c52 100644 --- a/vllm/v1/pool/metadata.py +++ b/vllm/v1/pool/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py index 9aa560d30eee..16bd2b9ffd84 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from abc import ABC, abstractmethod from collections.abc import Iterator, Sequence diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 30396f159433..87a84e5bf435 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -101,7 +101,10 @@ def forward_cuda( "per-request generators. Falling back to " "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) - return flashinfer_sample(logits, k, p, generators) + # flashinfer sampling functions expect contiguous logits. + # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous + # because of slicing operation in logits_processor. 
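        # (Editor's note, not part of the original patch.) torch.Tensor.contiguous()
        # returns the tensor itself when it is already contiguous and otherwise
        # materializes a dense copy, e.g.
        #   t = torch.randn(4, 8)[:, ::2]      # strided view, t.is_contiguous() is False
        #   t.contiguous().is_contiguous()     # True
        # so the call below is a no-op whenever the logits are already contiguous.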
+        return flashinfer_sample(logits.contiguous(), k, p, generators)
 
     def forward_tpu(
         self,
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index c5500b9a384d..839f1da8dd0d 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -40,22 +40,25 @@ def __init__(self, vllm_config: VllmConfig):
         self._grammar_bitmask: Optional[torch.Tensor] = None
         self._full_mask = torch.tensor(-1, dtype=torch.int32)
 
-        # The default max_workers if not specified is the number of CPUs * 5,
-        # which is way too high since these tasks are CPU-bound, not I/O bound.
-        # We also know we would never dominate CPU usage with just grammar
-        # compilation, so we set it to half the number of CPUs.
-        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-        self.executor = ThreadPoolExecutor(max_workers=max_workers)
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.vllm_config.model_config,
-            scheduler_config=self.vllm_config.scheduler_config,
-            lora_config=self.vllm_config.lora_config,
-        ).get_lora_tokenizer(None)
-        reasoning_backend = vllm_config.decoding_config.reasoning_backend
-        if reasoning_backend:
-            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                reasoning_backend)
-            self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        if not self.vllm_config.model_config.skip_tokenizer_init:
+            # The default max_workers if not specified is the number of
+            # CPUs * 5, which is way too high since these tasks are CPU-bound,
+            # not I/O bound. We also know we would never dominate CPU usage
+            # with just grammar compilation, so we set it to half the number
+            # of CPUs.
+            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+            self.executor = ThreadPoolExecutor(max_workers=max_workers)
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=self.vllm_config.model_config,
+                scheduler_config=self.vllm_config.scheduler_config,
+                lora_config=self.vllm_config.lora_config,
+            ).get_lora_tokenizer(None)
+            reasoning_backend = \
+                self.vllm_config.decoding_config.reasoning_backend
+            if reasoning_backend:
+                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                    reasoning_backend)
+                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 370de9f11599..410a54e7466f 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager
 from typing import Any
 
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index 9a35e8812038..de575d604055 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from importlib import util
 from typing import Optional
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4786d047acb5..57d0c7b50ff5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -334,6 +334,9 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         # the same order of requests. We ensure this by only allowing the first
         # group to reorder the batch and asserting that all other groups do not
         # reorder the batch.
+        # TODO(tdoublep): make this more flexible so that any group can
+        # re-order the batch (not only the first).
+        # TODO(tdoublep): verify this during engine init instead of at runtime
         for i in range(1, len(self.kv_cache_config.kv_cache_groups)):
             batch_reordered = self.attn_metadata_builders[i].reorder_batch(
                 self.input_batch, scheduler_output)
@@ -2449,6 +2452,7 @@ def _reshape_kv_cache_tensors(
             corresponding memory buffer for KV cache.
         """
         kv_caches: dict[str, torch.Tensor] = {}
+        has_attn, has_mamba = False, False
         for i, kv_cache_group_spec in enumerate(
                 kv_cache_config.kv_cache_groups):
             kv_cache_spec = kv_cache_group_spec.kv_cache_spec
@@ -2458,6 +2462,7 @@ def _reshape_kv_cache_tensors(
                 num_blocks = (raw_tensor.numel() //
                               kv_cache_spec.page_size_bytes)
                 if isinstance(kv_cache_spec, AttentionSpec):
+                    has_attn = True
                     kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
                         num_blocks, kv_cache_spec.block_size,
                         kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
@@ -2486,25 +2491,67 @@ def _reshape_kv_cache_tensors(
                         layer_name].view(dtype).view(kv_cache_shape).permute(
                             *inv_order)
                 elif isinstance(kv_cache_spec, MambaSpec):
+                    has_mamba = True
                     raw_tensor = kv_cache_raw_tensors[layer_name]
                     dtype = kv_cache_spec.dtype
+                    num_element_per_page = (kv_cache_spec.page_size_bytes //
+                                            get_dtype_size(dtype))
                     state_tensors = []
-                    start_pos = 0
+                    storage_offset = 0
                     for shape in kv_cache_spec.shapes:
                         target_shape = (num_blocks, *shape)
-                        size_in_bytes = np.prod(shape) * get_dtype_size(
-                            dtype) * num_blocks
-                        tensor = raw_tensor[start_pos:start_pos +
-                                            size_in_bytes]
-                        tensor = tensor.view(dtype).view(target_shape)
+                        stride = torch.empty(target_shape).stride()
+                        target_stride = (num_element_per_page, *stride[1:])
+                        tensor = torch.as_strided(
+                            raw_tensor.view(dtype),
+                            size=target_shape,
+                            stride=target_stride,
+                            storage_offset=storage_offset,
+                        )
                         state_tensors.append(tensor)
-                        start_pos += size_in_bytes
-                    assert start_pos == raw_tensor.numel()
-                    kv_caches[layer_name] = tuple(state_tensors)
+                        storage_offset += stride[0]
+
+                    kv_caches[layer_name] = state_tensors
                 else:
                     raise NotImplementedError
+
+        if has_attn and has_mamba:
+            self._verify_hybrid_attention_mamba_layout(kv_cache_config,
+                                                       kv_cache_raw_tensors)
+
         return kv_caches
 
+    def _verify_hybrid_attention_mamba_layout(
+            self, kv_cache_config: KVCacheConfig,
+            kv_cache_raw_tensors: dict[str, torch.Tensor]) -> None:
+        """
+        Verify that the KV cache memory layout is compatible for
+        models with both attention and mamba KV cache groups.
+
+        Args:
+            kv_cache_config: The KV cache config
+            kv_cache_raw_tensors: The KV cache buffer of each layer.
+        """
+
+        for i, kv_cache_group_spec in enumerate(
+                kv_cache_config.kv_cache_groups):
+            kv_cache_spec = kv_cache_group_spec.kv_cache_spec
+            for layer_name in kv_cache_group_spec.layer_names:
+                raw_tensor = kv_cache_raw_tensors[layer_name]
+                num_blocks = (raw_tensor.numel() //
+                              kv_cache_spec.page_size_bytes)
+                if isinstance(kv_cache_spec, AttentionSpec):
+                    kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
+                        num_blocks, kv_cache_spec.block_size,
+                        kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
+                    if kv_cache_shape[0] != num_blocks or kv_cache_shape[
+                            1] != 2:
+                        raise ValueError(
+                            "Hybrid models in V1 require an attention "
+                            "backend with kv_cache_shape="
+                            "(num_blocks, 2, ...). Please try setting "
+                            "VLLM_ATTENTION_BACKEND=FLASHINFER")
+
     def initialize_kv_cache_tensors(
             self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
         """
@@ -2623,11 +2670,69 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                     raise NotImplementedError(
                         "Prefix caching is not supported for Mamba yet.")
                 max_model_len = self.vllm_config.model_config.max_model_len
+
+                page_size_padded = self._maybe_pad_mamba_page_size(
+                    attn_layers, mamba_layers, kv_cache_spec, max_model_len,
+                    block_size)
+
                 # Set block_size to max_model_len, so that mamba model will always
                 # have only one block in the KV cache.
                 for layer_name, mamba_module in mamba_layers.items():
                     kv_cache_spec[layer_name] = MambaSpec(
                         shapes=mamba_module.get_state_shape(),
                         dtype=self.kv_cache_dtype,
-                        block_size=max_model_len)
+                        block_size=max_model_len,
+                        page_size_padded=page_size_padded)
+
         return kv_cache_spec
+
+    def _maybe_pad_mamba_page_size(
+        self,
+        attn_layers: dict[str, Attention],
+        mamba_layers: dict[str, MambaMixer2],
+        kv_cache_spec: dict[str, KVCacheSpec],
+        max_model_len: int,
+        block_size: int,
+    ) -> Optional[int]:
+        """
+        Ensure that page size of attention KV cache groups is greater than or
+        equal to the mamba KV cache groups. If not, we suggest to the user
+        how to set the attention block size to ensure that it is.
+
+        If the attention page size is strictly greater than the mamba page size,
+        we pad the mamba page size to make them equal.
+
+        Args:
+            attn_layers: Attention layers
+            mamba_layers: Mamba layers
+            kv_cache_spec: KV cache spec (populated with attention layers)
+
+        Returns:
+            Optional[int]: Mamba page size with padding (None if no padding).
+        """
+
+        if len(attn_layers) == 0:
+            return None
+
+        attn_layer_name = next(iter(attn_layers))
+        attn_page_size = kv_cache_spec[attn_layer_name].page_size_bytes
+        mamba_layer_name = next(iter(mamba_layers))
+        mamba_page_size = MambaSpec(
+            shapes=mamba_layers[mamba_layer_name].get_state_shape(),
+            dtype=self.kv_cache_dtype,
+            block_size=max_model_len).page_size_bytes
+        if attn_page_size < mamba_page_size:
+            # attention page size (for 16 tokens)
+            attn_page_size_16 = 16 * attn_page_size // block_size
+            # some attention backends (e.g. FA) only support setting
+            # block size to multiple of 16, so let's suggest a value
+            # that would work (note: FA is currently not compatible
+            # with mamba layers, use FlashInfer instead).
+            suggest_attn_block_size = 16 * cdiv(mamba_page_size,
+                                                attn_page_size_16)
+            raise ValueError(
+                "Attention block size should be increased to at least "
+                f"{suggest_attn_block_size} in order to match "
+                "the mamba page size")
+
+        return attn_page_size
diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py
index 55d116dcd496..4cedc913c2ab 100644
--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import TYPE_CHECKING
 
 import torch
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index d9ea03986566..6d1f5749d8b2 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 
 import torch

From 8fd66b4fc81254c8a843d6a2fed579cae3c02ad4 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 6 Jul 2025 00:53:14 -0700
Subject: [PATCH 13/13] Fix attn test

Signed-off-by: Woosuk Kwon
---
 tests/kernels/attention/test_attention_selector.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 3ee2a965c58b..7d7522c1fc00 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -82,14 +82,14 @@ def test_env(
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
         if device == "cpu":
+            if not use_v1:
+                pytest.skip("CPU backend only supports V1")
+
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            if use_v1:
-                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
-            else:
-                assert backend.get_name() == "TORCH_SDPA"
+            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
@@ -194,12 +194,14 @@ def test_fp32_fallback(
         m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 
         if device == "cpu":
+            if not use_v1:
+                pytest.skip("CPU backend only supports V1")
+
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, torch.float32,
                                            16, False)
-            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-                    if use_v1 else "TORCH_SDPA")
+            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
 
         elif device == "cuda":
             with patch("vllm.attention.selector.current_platform",