Skip to content

Commit bd4bfcf

Browse files
committed
update vllm interface
Signed-off-by: Ming Yang <[email protected]>
1 parent dc64529 commit bd4bfcf

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

hopper/flash_api_torch_lib.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
5454
int const sm_margin,
5555
std::optional<const at::Tensor> &s_aux_,
5656
int const cp_world_size,
57 -     int const cp_rank,
57 +     int const cp_rank
5858
);
5959

6060
// Only applicable to the case where seqused_k (i.e. cache_seqlens) is available
@@ -124,7 +124,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
124124
" int sm_margin,"
125125
" Tensor? s_aux,"
126126
" int cp_world_size,"
127 -     " int cp_rank") -> Tensor[]");
127 +     " int cp_rank) -> Tensor[]");
128128
ops.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
129129

130130
ops.def("get_scheduler_metadata("

vllm_flash_attn/flash_attn_interface.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ def flash_attn_varlen_func(
146146
# Version selector
147147
fa_version: int = DEFAULT_FA_VERSION,
148148
s_aux=None,
149 +     cp_world_size=1,
150 +     cp_rank=0,
149151
):
150152
"""dropout_p should be set to 0.0 during evaluation
151153
Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
@@ -279,7 +281,9 @@ def flash_attn_varlen_func(
279281
num_splits,
280282
None, # pack_gqa
281283
0, # sm_margin
282 -     s_aux # s_aux
284 +     s_aux, # s_aux
285 +     cp_world_size,
286 +     cp_rank,
283287
)
284288
else:
285289
raise ValueError(f"Unsupported FA version: {fa_version}")
@@ -316,6 +320,8 @@ def flash_attn_with_kvcache(
316320
# Version selector
317321
fa_version: int = DEFAULT_FA_VERSION,
318322
s_aux=None,
323 +     cp_world_size=1,
324 +     cp_rank=0,
319325
):
320326
"""
321327
If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from

0 commit comments

Comments (0)