
Commit 53df680

Preliminary blackwell enablement (vllm-project#54)
* Pad flashmla_sparse to 128 on blackwell
* adjust get_max_prefill_buffer_size
* change comments
1 parent e744e06 commit 53df680

2 files changed: +10 −6 lines changed


vllm/v1/attention/backends/mla/flashmla_sparse.py

Lines changed: 8 additions & 5 deletions
@@ -21,6 +21,7 @@
 from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
                                               CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.platforms import current_platform

 if TYPE_CHECKING:
     from vllm.model_executor.models.deepseek_v2 import Indexer

@@ -388,13 +389,15 @@ def _forward_bf16_kv(
         kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view(
             -1, 1, kv_c_and_k_pe_cache.shape[-1])

-        # NOTE(Chen): kernel requires num_local_head to be a multiple of 64.
-        if self.num_heads % 64 != 0:
-            assert 64 % self.num_heads == 0
+        # NOTE(Chen): kernel requires num_local_head to be a multiple of
+        # 64 on hopper and 128 on blackwell
+        padding = 128 if current_platform.is_device_capability(100) else 64
+        if self.num_heads % padding != 0:
+            assert padding % self.num_heads == 0
             logger.warning_once(
-                "padding num_heads to 64 due to sparse attn kernel requirement"
+                f"padding num_heads to {padding} due to sparse attn kernel requirement"
             )
-            q_padded = q.new_empty((q.shape[0], 64, q.shape[2]))
+            q_padded = q.new_empty((q.shape[0], padding, q.shape[2]))
             q_padded[:, :self.num_heads, :] = q
             q = q_padded

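For reference, here is a minimal standalone sketch of the padding step above. It is not the vLLM method itself: `pad_query_heads` is a hypothetical helper and the `is_blackwell` flag stands in for `current_platform.is_device_capability(100)`; only the pad-to-64/128 logic mirrors the diff.

```python
import torch


def pad_query_heads(q: torch.Tensor, num_heads: int,
                    is_blackwell: bool) -> torch.Tensor:
    """Sketch of the query-head padding; q has shape (num_tokens, num_heads, head_dim)."""
    # Sparse FlashMLA expects the head count to be a multiple of 64 on Hopper
    # and 128 on Blackwell (SM100), per the diff above.
    padding = 128 if is_blackwell else 64
    if num_heads % padding == 0:
        return q  # already a valid multiple, no copy needed
    # Only padding *up* to the multiple is supported, so the real head count
    # must divide it evenly (e.g. 16, 32, or 64 heads -> pad to 128).
    assert padding % num_heads == 0
    # Allocate a padded buffer (extra head slots left uninitialized, as in the
    # diff) and copy the real heads into the front.
    q_padded = q.new_empty((q.shape[0], padding, q.shape[2]))
    q_padded[:, :num_heads, :] = q
    return q_padded


# Example: 64 local heads on Blackwell get padded to 128 (head size is
# arbitrary for the example).
q = torch.randn(8, 64, 576, dtype=torch.bfloat16)
print(pad_query_heads(q, 64, is_blackwell=True).shape)  # torch.Size([8, 128, 576])
```
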
vllm/v1/attention/backends/mla/indexer.py

Lines changed: 2 additions & 1 deletion
@@ -148,8 +148,9 @@ def kv_spans_from_batches(start_seq_loc: torch.Tensor,
 def get_max_prefill_buffer_size(vllm_config: VllmConfig):
     max_model_len = vllm_config.model_config.max_model_len
     max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+    max_num_seq = vllm_config.scheduler_config.max_num_seqs
     # NOTE(Chen): an estimated max size of flattened_kv. Need to double check.
-    return max_model_len + max_num_batched_tokens
+    return max_model_len * max_num_seq


 class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
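
As a quick illustration of the revised estimate, the sketch below restates both formulas with plain integers in place of a `VllmConfig` object; the parameter names simply mirror the config fields used in the diff, and the example numbers are illustrative only. The new bound presumably allows every one of the `max_num_seqs` scheduled sequences to contribute up to `max_model_len` tokens to the flattened KV, whereas the old bound only added one batch worth of tokens.

```python
def old_estimate(max_model_len: int, max_num_batched_tokens: int) -> int:
    # previous return value in the diff
    return max_model_len + max_num_batched_tokens


def new_estimate(max_model_len: int, max_num_seqs: int) -> int:
    # NOTE(Chen) in the diff: "an estimated max size of flattened_kv"
    return max_model_len * max_num_seqs


# Illustrative numbers: 163,840-token context, 8,192-token batches,
# up to 16 concurrent sequences.
print(old_estimate(163_840, 8_192))  # 172032
print(new_estimate(163_840, 16))     # 2621440
```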
