
Commit dc64529

update tests
Signed-off-by: Ming Yang <[email protected]>
1 parent: 54be252

2 files changed, 6 insertions(+), 4 deletions(-)

hopper/test_flash_attn.py

4 additions, 2 deletions

@@ -122,9 +122,9 @@
     ],
 )
 @pytest.mark.parametrize(
-    "cp_world_size", [4, 2],
+    "cp_world_size", [4, 2, 1], # 1 means disabling cp
 )
-# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
+#@pytest.mark.parametrize('seqlen_q,seqlen_k', [(1, 1)])
 def test_flash_attn_output(
     seqlen_q, seqlen_k, d, causal, local, softcap, V_colmajor, deterministic, has_qv_, mha_type, dtype, test_sink,
     cp_world_size,
@@ -135,6 +135,8 @@ def test_flash_attn_output(
         pytest.skip("Has Qv requires hdim 64 and dtype to be float16 or bfloat16 (not float8_e4m3fn)")
     if test_sink and has_qv_:
         pytest.skip("Sink disabled for Qv")
+    if cp_world_size > 1 and local:
+        pytest.skip("context parallelism is not supported with local attention yet")
     device = "cuda"
     # set seed
     torch.random.manual_seed(0)
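
In this test module, cp_world_size is the number of context-parallel ranks the key/value sequence is split across, so parametrizing it with 1 also exercises the ordinary non-CP path, and the new skip sidesteps the still-unsupported combination of context parallelism with local (sliding-window) attention. A minimal sketch of the sharding idea, where shard_kv_for_rank is a hypothetical helper and not part of this repository:

import torch

def shard_kv_for_rank(kv: torch.Tensor, cp_world_size: int, cp_rank: int) -> torch.Tensor:
    # Split the key/value tensor along the sequence dimension (dim=1) into
    # cp_world_size contiguous chunks and return the chunk owned by cp_rank.
    if cp_world_size == 1:  # cp disabled: this rank keeps the full sequence
        return kv
    return kv.chunk(cp_world_size, dim=1)[cp_rank]

kv = torch.randn(2, 128, 8, 64)  # (batch, seqlen_k, nheads, headdim)
print(shard_kv_for_rank(kv, cp_world_size=4, cp_rank=0).shape)  # torch.Size([2, 32, 8, 64])
print(shard_kv_for_rank(kv, cp_world_size=1, cp_rank=0).shape)  # torch.Size([2, 128, 8, 64])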

hopper/test_util.py

2 additions, 2 deletions

@@ -250,12 +250,12 @@ def construct_cp_mask(
 
     # Calculate effective sequence lengths
     sk = (
-        seqlen_k * cp_world_size # Global seqlen_k for DCP
+        torch.tensor(seqlen_k * cp_world_size, device=device, dtype=torch.long) # Global seqlen_k for DCP
         if key_padding_mask is None
         else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") * cp_world_size
     )
     sq = (
-        seqlen_q
+        torch.tensor(seqlen_q, device=device, dtype=torch.long) # Global seqlen_k for DCP
         if query_padding_mask is None
         else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
     )
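
The change above wraps the scalar sequence lengths in 0-dim torch.long tensors so that sk and sq are tensors whether or not a padding mask is supplied. A plausible reason (an assumption, not stated in the commit) is that the mask arithmetic that follows uses elementwise tensor ops such as torch.minimum, which require tensor operands on both sides; a small self-contained sketch of the difference:

import torch

device = "cpu"  # the real tests run on "cuda"
seqlen_k, cp_world_size = 8, 4

# Column indices over the global (CP-wide) key length, as a mask builder might use.
col_idx = torch.arange(seqlen_k * cp_world_size, device=device, dtype=torch.long)

sk_int = seqlen_k * cp_world_size                                  # plain Python int
sk_tensor = torch.tensor(sk_int, device=device, dtype=torch.long)  # what the diff switches to

# torch.minimum expects Tensor operands, so the int form is rejected while the
# 0-dim tensor broadcasts against col_idx without complaint.
try:
    torch.minimum(col_idx, sk_int)
except TypeError as err:
    print("int operand rejected:", err)

print(torch.minimum(col_idx, sk_tensor))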
