Merged
10 changes: 9 additions & 1 deletion cpp/kernels/fmha_v2/README.md
@@ -2,7 +2,7 @@

## Introduction

- FMHA_v2 is just a bunch of Multi-head Attention kernels that weve enabled for known cases. Its not built as a library (cuBLAS, cuDNN, HazyResearch's MHA, etc) that is supposed to deliver good perf for all cases. End users will get access to FMHA through products or libraries, not directly through FMHA_v2.
+ FMHA_v2 is just a bunch of Multi-head Attention kernels that we've enabled for known cases. It's not built as a library (cuBLAS, cuDNN, HazyResearch's MHA, etc) that is supposed to deliver good perf for all cases. End users will get access to FMHA through products or libraries, not directly through FMHA_v2.

## Launch a container to build the code

@@ -80,3 +80,11 @@ Why is the FMHA_v2 slower than public implementation in several cases?
```
Usually, adding new launch configurations suffices. The heuristics of FMHA_v2 are designed to work optimally for known cases. If you encounter an unknown case, first check if FMHA_v2 has a suitable kernel. If there isn't one, feel free to approach us and we'll enable a new configuration.
```

What's the difference between cubins and cu files?

```
Cubins are precompiled binary files (built from the internal fmha_v2 repo) and take up a lot of space; cu files are generated directly from this repo. Most kernels have now been replaced with cu files, and unused cubins have been deleted.
You can modify the code in this repo to change existing kernels or create your own, and run them.
Some kernels still run from cubins. See use_cubin_header (setup.py#L3055) and modify_cubin_header (setup.py#L3413) for details.
```
17 changes: 13 additions & 4 deletions cpp/kernels/fmha_v2/setup.py
@@ -3049,14 +3049,20 @@ def get_kernel_traits_code(specs_names):
return code


# For now, only the Hopper head_size-128 kernels use cubins; all other kernels use cu files.
# Set the `use_cubin_header` condition to false if you have modified the source code of the Hopper (sm90) head_size-128 FMHA kernels.
# This ensures those kernels are recompiled from the updated source rather than loaded from stale precompiled cubins.
def use_cubin_header(kspec):
return kspec.sm == 90 and kspec.head_size == 128
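To illustrate the predicate above: a minimal, hypothetical sketch of how such a check filters kernel specs. The stand-in `KSpec` here carries only the two fields the predicate reads; the real kernel spec in setup.py has many more attributes.

```python
from collections import namedtuple

# Hypothetical, minimal stand-in for the real kernel spec in setup.py.
KSpec = namedtuple('KSpec', ['sm', 'head_size'])

def use_cubin_header(kspec):
    # Only Hopper (sm90) head_size-128 kernels still ship as cubins.
    return kspec.sm == 90 and kspec.head_size == 128

specs = [
    KSpec(sm=90, head_size=128),  # kept as cubin
    KSpec(sm=90, head_size=64),   # built from cu file
    KSpec(sm=80, head_size=128),  # built from cu file
]
cubin_specs = [s for s in specs if use_cubin_header(s)]
print(len(cubin_specs))  # only the sm90 / head_size-128 spec remains
```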


def get_cubin_header(kernel_traits, specs_names):
cubins = []
cubin_lens = []
cubins_dict = {}
cubin_lens_dict = {}
for kspec, fname, lname, kname in specs_names:
# only generate hopper cubin header
- if generate_cu_trtllm and not 'sm90' in kname:
+ if generate_cu_trtllm and not use_cubin_header(kspec):
continue
name = fname.replace('.', '_')
data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name)
@@ -3209,7 +3215,7 @@ def get_cubin_header(kernel_traits, specs_names):
if generate_cu_trtllm:

def get_lname_from_kname(kname: str) -> str:
- if 'sm90' in kname:
+ if use_cubin_header(kspec):
return 'nullptr'
lname = kname.replace('_kernel', '')
mask_types = [
@@ -3228,7 +3234,7 @@ def get_lname_from_kname(kname: str) -> str:
{cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
{attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
- '''.format(**locals()) if 'sm90' in kname else '''\
+ '''.format(**locals()) if use_cubin_header(kspec) else '''\
{{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
@@ -3404,6 +3410,9 @@ def get_lname_from_kname(kname: str) -> str:
return code


# This adds some kernels that still run from cubins.
# The source code of the paged context fmha kernels is not in this repo, but we ship cubins for them.
# The other kernels are kept to pass CI cases.
def modify_cubin_header(cubin_header):
# for paged context fmha cases
target = "#ifndef EXCLUDE_SM_90"
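The body of `modify_cubin_header` is truncated in the diff, but the shown `target` marker suggests a string-splicing approach. A hedged, hypothetical sketch of that idea, using an invented cubin symbol name purely for illustration:

```python
# Hypothetical sketch (not the real, truncated implementation) of patching
# extra cubin entries into the generated header: find the EXCLUDE_SM_90
# guard and splice hand-maintained declarations in right after it.
def modify_cubin_header(cubin_header):
    target = '#ifndef EXCLUDE_SM_90'
    # Invented symbol name for a paged-context FMHA cubin whose source
    # is not in this repo; the real names live in setup.py.
    extra = 'extern unsigned char cubin_fmha_paged_ctx_sm90_cubin[];'
    # Replace only the first occurrence so the guard is not duplicated.
    return cubin_header.replace(target, target + '\n' + extra, 1)

header = '#ifndef EXCLUDE_SM_90\n// sm90 kernel entries\n#endif\n'
patched = modify_cubin_header(header)
print(patched)
```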
2,912 changes: 1,338 additions & 1,574 deletions cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h

Large diffs are not rendered by default.

This file was deleted.