diff --git a/cpp/kernels/fmha_v2/README.md b/cpp/kernels/fmha_v2/README.md index ed474cf6035..ce189f21875 100644 --- a/cpp/kernels/fmha_v2/README.md +++ b/cpp/kernels/fmha_v2/README.md @@ -2,7 +2,7 @@ ## Introduction -FMHA_v2 is just a bunch of Multi-head Attention kernels that we’ve enabled for known cases. It’s not built as a library (cuBLAS, cuDNN, HazyResearch's MHA, etc) that is supposed to deliver good perf for all cases. End users will get access to FMHA through products or libraries, not directly through FMHA_v2. +FMHA_v2 is just a bunch of Multi-head Attention kernels that we've enabled for known cases. It's not built as a library (cuBLAS, cuDNN, HazyResearch's MHA, etc) that is supposed to deliver good perf for all cases. End users will get access to FMHA through products or libraries, not directly through FMHA_v2. ## Launch a container to build the code @@ -80,3 +80,11 @@ Why is the FMHA_v2 slower than public implementation in several cases? ``` Usually, adding new launch configurations suffices. The heuristics of FMHA_v2 are designed to work optimally for known cases. If you encounter an unknown case, first check if FMHA_v2 has a suitable kernel. If there isn't one, feel free to approach us and we'll enable a new configuration ``` + +What's the difference between cubins and cu files? + +``` +Cubins are precompiled binary files (built from the internal fmha_v2 repo) and take up a lot of space; cu files are generated directly from this repo. Most kernels have now been replaced with cu files, and the unused cubins have been deleted. +You can modify the code in this repo to change existing kernels or create your own, and then run them. +Some kernels still run from cubins. See use_cubin_header(setup.py#L3055) and modify_cubin_header(setup.py#L3413) for details. 
+``` diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index ec0032cfaca..4550c23f6bd 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -3049,14 +3049,20 @@ def get_kernel_traits_code(specs_names): return code +# For now, only the Hopper (sm90) kernels with head_size 128 use cubins; all other kernels use cu files. +# Make `use_cubin_header` return False if you have modified the source code of the FMHA kernels on Hopper (sm90) with head_size 128. +# This ensures that the kernels will be recompiled using the updated source code rather than relying on precompiled cubins. +def use_cubin_header(kspec): + return kspec.sm == 90 and kspec.head_size == 128 + + def get_cubin_header(kernel_traits, specs_names): cubins = [] cubin_lens = [] cubins_dict = {} cubin_lens_dict = {} for kspec, fname, lname, kname in specs_names: - # only generate hopper cubin header - if generate_cu_trtllm and not 'sm90' in kname: + if generate_cu_trtllm and not use_cubin_header(kspec): continue name = fname.replace('.', '_') data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name) @@ -3209,7 +3215,7 @@ def get_cubin_header(kernel_traits, specs_names): if generate_cu_trtllm: def get_lname_from_kname(kname: str) -> str: - if 'sm90' in kname: + if use_cubin_header(kspec): return 'nullptr' lname = kname.replace('_kernel', '') mask_types = [ @@ -3228,7 +3234,7 @@ def get_lname_from_kname(kname: str) -> str: {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \ {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \ {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\ -'''.format(**locals()) if 'sm90' in kname else '''\ +'''.format(**locals()) if use_cubin_header(kspec) else '''\ {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \ 
{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \ 0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \ @@ -3404,6 +3410,9 @@ def get_lname_from_kname(kname: str) -> str: return code +# This is used to add some kernels running in cubins. +# The source code of paged context fmha kernels are not in this repo, but we have cubins for them. +# Other kernels are for passing CI cases. def modify_cubin_header(cubin_header): # for paged context fmha cases target = "#ifndef EXCLUDE_SM_90" diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h index a0562adb1d0..5b3206d435d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h @@ -28,360 +28,434 @@ namespace kernels #ifndef EXCLUDE_SM_90 extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin[]; -extern unsigned 
char cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned 
char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned 
char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned 
char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin[]; -extern 
unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern 
unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin[]; 
-extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin[]; extern unsigned char 
cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin[]; -extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin[]; +extern void run_fmha_v2_fp16_64_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_64_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_128_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_128_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_256_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_256_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_64_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_fp16_64_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_128_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_128_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_256_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_256_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_384_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_384_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_512_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_512_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_384_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_384_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_512_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_512_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_bf16_64_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_64_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_128_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_128_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_256_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_256_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_64_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_64_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_128_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_128_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_256_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_256_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_384_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_bf16_384_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_512_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_512_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_384_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_384_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_512_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_bf16_512_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, 
cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& 
launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& 
launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); 
+extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, 
const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, 
const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& 
params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& 
params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, 
cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const 
Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90(Fused_multihead_attention_params_v2& 
params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern 
void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); 
+extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_nl(Fused_multihead_attention_params_v2& 
params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t 
stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_nl(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_nl_tiled(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); #endif #ifndef EXCLUDE_SM_89 @@ -1320,360 +1394,50 @@ extern void run_fmha_v2_flash_attention_fp16_fp32_64_16_S_q_paged_kv_256_softcap #ifndef EXCLUDE_SM_90 extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len; 
-extern uint32_t cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len; -extern 
uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len; -extern 
uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len; extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin_len; extern 
uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len; -extern uint32_t 
cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin_len; #endif @@ -1711,960 +1475,960 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2 #ifndef EXCLUDE_SM_90 { DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_causal_tma_ws_sm90_kernel", 213248, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, { DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_causal_tma_ws_sm90_kernel", 213248, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 
0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_fp16_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, false, true, false, 
false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_fp16_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, false, true, 
false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_32_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_fp16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, 
false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 
64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_bf16_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, true, true, false, false, 
false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_bf16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, true, 
true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, 
"fmha_v2_bf16_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 1, 0, false, false, false, true, true, false, 
false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_bf16_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_bf16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, 
cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin, cubin_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_cu_cubin_len, "fmha_v2_fp16_fp32_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, false, false, 
false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, 
kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, true, false, 
nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, false, false, false, false, false, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 
0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, 
nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, 
true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 
2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 
0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, true, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, true, false, 
false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, 
false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, 
false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, 
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, true, true, true, 
false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, 
false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 
0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, 
nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, 
nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, 
true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 78208, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 82304, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 
64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 78208, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, 
false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, 
true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 
229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 
0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, 
false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 80, 80, 64, 64, 256, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 128, 128, 64, 64, 256, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 
32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 
40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, 
false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, 
true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, 
false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 
72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 
384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, 
false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, 
false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, 
true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 
0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 
81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 
64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 
0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, 
kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 
128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 
0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_bf16_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, 
false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 
0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, 
false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, 
cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, true, false, nullptr}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 576, 512, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_kernel_nl_tiled", 49152, 128, 64, 0, 2, false, true, false, true, true, true, false, false, nullptr}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 
128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_fp16_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 
64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, false, true, 
false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 
64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, 
nullptr, 0, "fmha_v2_fp16_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, false, true, false, false, false, run_fmha_v2_fp16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, true, true, false, false, false, 
run_fmha_v2_bf16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, 
false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_bf16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 
64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90_nl}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, true, true, false, false, 
false, run_fmha_v2_bf16_512_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_32_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 
1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_bf16_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_bf16_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_bf16_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_ldgsts_sm90_kernel", 17408, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 17408, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_causal_ldgsts_sm90_kernel", 17408, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_ldgsts_sm90_kernel_nl", 17408, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 17408, 128, 64, 2, 0, false, false, false, true, true, false, false, false, 
run_fmha_v2_fp16_fp32_64_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_ldgsts_sm90_kernel", 25600, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 25600, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_causal_ldgsts_sm90_kernel", 25600, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_ldgsts_sm90_kernel_nl", 25600, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 25600, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_32_ldgsts_sm90_kernel", 41984, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_fp16_fp32_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 41984, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_32_causal_ldgsts_sm90_kernel", 41984, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_32_ldgsts_sm90_kernel_nl", 41984, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_32_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 41984, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_ldgsts_sm90_kernel", 33792, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 33792, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_causal_ldgsts_sm90_kernel", 33792, 128, 0, 1, 0, false, false, false, true, true, false, false, false, 
run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_ldgsts_sm90_kernel_nl", 33792, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 64, 64, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_64_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 33792, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_64_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_64_ldgsts_sm90_kernel", 50176, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 50176, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_64_causal_ldgsts_sm90_kernel", 50176, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_64_ldgsts_sm90_kernel_nl", 50176, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_fp16_fp32_128_64_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 64, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_128_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 50176, 128, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_128_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_ldgsts_sm90_kernel", 82944, 128, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 82944, 128, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_causal_ldgsts_sm90_kernel", 82944, 128, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_ldgsts_sm90_kernel_nl", 82944, 128, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_256_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 82944, 128, 64, 2, 0, false, false, false, true, true, false, 
false, false, run_fmha_v2_fp16_fp32_256_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_ldgsts_sm90_kernel", 67072, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 67072, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_causal_ldgsts_sm90_kernel", 67072, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_ldgsts_sm90_kernel_nl", 67072, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 67072, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_32_ldgsts_sm90_kernel", 83456, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_fp16_fp32_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel", 83456, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_32_causal_ldgsts_sm90_kernel", 83456, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_32_ldgsts_sm90_kernel_nl", 83456, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_32_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_32_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 83456, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_32_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_ldgsts_sm90_kernel", 132608, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 132608, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_causal_ldgsts_sm90_kernel", 132608, 256, 0, 1, 0, false, false, false, true, true, false, false, 
false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_ldgsts_sm90_kernel_nl", 132608, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 384, 64, 384, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_384_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 132608, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_384_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_64_ldgsts_sm90_kernel", 165376, 256, 0, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel", 165376, 256, 0, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_64_causal_ldgsts_sm90_kernel", 165376, 256, 0, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_64_ldgsts_sm90_kernel_nl", 165376, 256, 64, 0, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_fp16_fp32_512_64_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 1, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 512, 64, 512, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_fp16_fp32_512_64_sliding_or_chunked_causal_ldgsts_sm90_kernel_nl", 165376, 256, 64, 2, 0, false, false, false, true, true, false, false, false, run_fmha_v2_fp16_fp32_512_64_ldgsts_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, 
true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 
0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, 
false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, 
kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, 
false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, false, false, false, true, false, 
run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, 
false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, true, true, false, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 
0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, false, false, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, false, false, false, true, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, false, false, false, false, true, run_fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, 
true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, true, 
false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, 
nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, 
false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 
72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, 
true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, 
true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 78208, 384, 64, 2, 0, false, 
true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 
0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, 
nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 82304, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 82304, 384, 64, 1, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 78208, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 82304, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 164224, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 156032, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 164224, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 
196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90_kernel", 196864, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, 
false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 229632, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, true, false, 
run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_qkv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 80, 80, 64, 64, 256, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 128, 128, 64, 64, 256, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 
384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90}, 
+{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, 
false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 0, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, 
kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 0, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 
64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_custom_mask_tma_ws_sm90_kernel", 73984, 384, 64, 3, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90_kernel", 147712, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_causal_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_sliding_or_chunked_causal_tma_ws_sm90_kernel", 147712, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_custom_mask_tma_ws_sm90_kernel", 147712, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_custom_mask_tma_ws_sm90_kernel", 164096, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_custom_mask_tma_ws_sm90_kernel", 196864, 384, 64, 3, 2, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, false, false, true, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 164096, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_custom_mask_softmax_tma_ws_sm90_kernel", 73984, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 0, 1, 
false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_custom_mask_softmax_tma_ws_sm90_kernel", 147712, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_72_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 0, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_custom_mask_softmax_tma_ws_sm90_kernel", 164096, 384, 64, 3, 1, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, true, false, false, 
false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_alibi_tma_ws_sm90}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_causal_alibi_tma_ws_sm90_kernel", 73984, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 40, 40, 0, 0, 0, 
kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_causal_alibi_tma_ws_sm90_kernel", 147712, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_72_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_80_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_96_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, 
kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_104_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_causal_alibi_tma_ws_sm90_kernel", 164096, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 196864, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, 
true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 
64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, 
false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 
104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 
0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_192_sm90_nl_tiled}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, false, true, true, false, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, 
false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 
64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, 
run_fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 
0, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, false, true, false, false, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, true, false, 
run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, false, true, true, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, false, true, true, true, false, run_fmha_v2_flash_attention_fp16_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, false, true, false, true, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, false, true, false, true, false, run_fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, false, true, false, true, false, 
run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, false, true, false, true, false, run_fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, 
kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 
2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 
72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, 
true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 
128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, 
false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_nl}, 
+{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, 
run_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, 
false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, 
false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_bf16_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, true, false, 
run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 16384, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_custom_mask_sm90_kernel_nl_tiled", 16384, 128, 128, 3, 0, false, true, false, true, true, true, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_16_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_kernel_nl_tiled", 32768, 128, 128, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 32768, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_custom_mask_sm90_kernel_nl_tiled", 32768, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_32_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_nl_tiled}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_40_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_48_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_kernel_nl_tiled", 65536, 128, 128, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 65536, 128, 128, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 128, 128, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_custom_mask_sm90_kernel_nl_tiled", 65536, 128, 128, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_128_128_S_qkv_64_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_72_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 
1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_80_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_96_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_104_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_kernel_nl_tiled", 81920, 128, 64, 0, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90_nl_tiled}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_160_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_192_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, 
nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_custom_mask_sm90_kernel_nl_tiled", 81920, 128, 64, 3, 0, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_causal_sm90_kernel_nl", 6144, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sliding_or_chunked_causal_sm90_kernel_nl", 6144, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 16, 16, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_custom_mask_sm90_kernel_nl", 6144, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_kernel_nl", 12288, 128, 64, 0, 0, false, true, false, true, true, 
false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_causal_sm90_kernel_nl", 12288, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sliding_or_chunked_causal_sm90_kernel_nl", 12288, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 64, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_custom_mask_sm90_kernel_nl", 12288, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_kernel_nl", 16384, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_causal_sm90_kernel_nl", 16384, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sliding_or_chunked_causal_sm90_kernel_nl", 16384, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 64, 64, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_custom_mask_sm90_kernel_nl", 16384, 128, 64, 3, 0, false, true, false, true, true, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 72, 72, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 80, 80, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 96, 96, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 104, 104, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_kernel_nl", 32768, 128, 64, 0, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_causal_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sliding_or_chunked_causal_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_custom_mask_sm90_kernel_nl", 32768, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_causal_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sliding_or_chunked_causal_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_custom_mask_sm90_kernel_nl", 49152, 128, 64, 3, 0, false, true, false, true, true, false, false, false, 
run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl_tiled", 81920, 128, 64, 2, 0, false, true, false, true, true, true, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_nl_tiled}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 1, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, 
false, true, true, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sliding_or_chunked_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, true, false, run_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_nl}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 576, 512, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_kernel_nl_tiled", 49152, 128, 64, 0, 2, false, true, false, true, true, true, false, false, run_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_nl_tiled}, #endif #ifndef EXCLUDE_SM_89 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 3fac7ab6e3d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d611d3b6de51786a670c82f2ee24fcbbb32122be0c2d03acf2f43a0fed5c428 -size 1673322 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 37a165bf277..00000000000 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1efdf9bfad7cd91fae2d91ca178eac7f4681f0a16ce3b3a3e30bc45f97e709a1 -size 1749098 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 8bb16e2a991..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:726657443ba35eb270669963b5eace32bb385299d4ed754ea925e6ad982e986a -size 1357588 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 73faacc5371..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5af9ec0afda8f44f136078910129e365edc4bb0077d48b78ab33472bc666582 -size 1464938 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 88e75072416..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4832fda9bb46eac65f0d3f5be893656cec0790b89ee8278b2a4f11b0cefabd98 -size 1757780 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index c73e1f4d600..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad2ccb0b5e920db840f4be6a8073b5c4edae09befcfd80f1bed9190ba251a8df -size 1850132 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index aced4f0726e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:036a4acef2edb9c1bb3758471d74d3f3a49275e7627b1e4cf88ae7dd9a94afb7 -size 674812 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index a1597ed51d7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a398a72d3f4a39237a8d924417f66f01267fbc001dc45d403a32417e1849cf13 -size 701650 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90.cubin.cpp deleted file mode 100644 index cafb95723c6..00000000000 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_160_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66e86d679e8743ed56d1cc5be511e15a0217548d6167e85a553d020459b717c7 -size 531981 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90.cubin.cpp deleted file mode 100644 index a0b1aaea652..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_192_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7701158a7adbb9b6bb1d86838926ebaa5749b461bfe3adaf5a7e39bcb5a5b97 -size 617229 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90.cubin.cpp deleted file mode 100644 index efbe97afc0d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a62fbc666fe22e0a8035bee9bc1261582372b34f0ec9fd7746efad8d3a04ed5f -size 619598 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp deleted file mode 100644 index 54813236292..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ebd02fc6fb4bb3504625045acb76c9eb3ee3101eb2f7ab7b42640ed86fd1399b -size 429391 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 863ed9ea2ef..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9be9ec3173409c612a33ba128995a3bd4f2ded4b2902273ed865c6a8519810a6 -size 850854 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1886fa43522..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cf0a02527b2c15230df359917e11f604c26b6d78adc352dcf7307364c04df7f -size 809794 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1e864f8becc..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 
-oid sha256:3a053c221a41cfce86287a9420d09f4de1a061edceed662164270e572449bba7 -size 876902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 6a6725a09bb..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb674ea83851e98f4c218d331133cf9c448ebb9dcc04796110244f6966876343 -size 827158 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 80fc9a54bd7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99877cdab237200bc2dfa4350c293d095c0fa8cf2e0c7d3ac846ccd3bba1f9b4 -size 600643 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 5b41968f8f3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:9bd33d90f411a9e3c34d5092e84f33dea8aaed67e4c443577a41f5a62d5c99ce -size 2047480 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c20f11154c7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd0f321f087a6cf8c1a1e436e178b924310efb481b0970c96d5d345e491c1577 -size 533549 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 113f0b5341a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:642254ac102cb530bde2bd1b120521e5684db3a0d263a7dc51064e16e4b2a27f -size 1603874 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e2d46193c05..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a4cb39de2fa2c6d9afba196ebe17192ea69a702e2a77e844df9d8a5cb76eec46 -size 535129 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e4aaa5ce702..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fe539ac354af31bc56f806b3e96d137d28b573d05177f48ba783958a9203b54 -size 1606242 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 5f5181e1a4e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba4ca04e00d3545fc4444a9e2826ccf5b2d87feb2efffa593c7c6c76096f6240 -size 536707 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c7825385f8d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:6d4f4c92d2a3b3cfc05f6de76b71a546b9ad701c0535aee33d2fe38f6e3306bd -size 2116152 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 325d1a2cdc0..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e292dfc076cb2ba0e5e069c5604350a7610d555f3addf1f889af486ad3497b5e -size 573791 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 5cdce6f2d56..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15f081c68681a54bb112c7e8202039eb6488803392f316930d867d017d9c3265 -size 1937748 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 5236be45cc3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:109b9e58496def51f9d228b2c3ece5cbd147100ce559829a322207aefcbe001b -size 
507487 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 334cc31ab61..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff5aa5d4f8a950da4651817fb86e269491c9fb5524a38cbf4c151e5dab1b4a8c -size 1514666 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index cd6d23f1249..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0a5aad225d22c9eaa89a8e2d14349fd6787496178789e3dd0de9d3a879e8493 -size 508277 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 911f97d1a1e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1b7c60ac63a0ad08f1a7b6889fb81a79278bde0b0944293bbcf9a26383d2bd1 -size 1517034 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 3d2b9a1fe90..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5db201fa14cc3f1b8aa38cafd4a77e566d26e6949760236ad7d5b62ae010a841 -size 509855 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 3fb2212af9c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4ddaf9f331e63c569793a92042c745e02afd54f1f0eef8b54d501ddfe241d8c -size 2004052 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90.cubin.cpp deleted file mode 100644 index e5f7f3600ab..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_104_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4dd690d4ab9f13c252441610922bfcbe9e46dfbdaee9fef1a74b9c4d5dcad044 -size 484621 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90.cubin.cpp deleted file mode 100644 index 8d77ca41328..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_40_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71be5d957122dd03c138303f1fabe808f5a3a9ae48db74d5b8c620f94711813d -size 374901 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90.cubin.cpp deleted file mode 100644 index 9fc110a8016..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_48_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69f0a060045dd6bfae78c6f781c665a245aa8ebd47d41b9e9cfc73605dcb846c -size 374901 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90.cubin.cpp deleted file mode 100644 index 9367de223d0..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_64_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0ec7144ce7e4e2bfa669ff711dbdc35d00d0869c98f2c6a79b75b64f072c88c -size 508299 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90.cubin.cpp deleted file mode 100644 index e057d893d0b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_72_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f5fca0d3ed98ac3602dc662bc5ac4c2ed09fb98444e877de646f684e230fb61 -size 568287 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90.cubin.cpp deleted file mode 100644 index 56aeadf42a3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_80_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ff05bf07b4f7b66ef9118a35e935ac99a57156fe66570a57ccf68aed167bcd0 -size 438837 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90.cubin.cpp deleted file mode 100644 index ff141746357..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_96_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77896a1ba564458be32965e7890401e6dd3bf5a651abe715151124793dc94dab -size 457781 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 
100644 index 90cc7121660..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8bcb0c931ecfa429d8414e81ab67be7998d743233c8ac5468777b2b2486f6f72 -size 347267 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index a7dd8015066..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1017f8b842402ad13024d48a9db2170dc9e0ad9d1a7124a8423db2144e51087a -size 1129486 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 114f6264088..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23b6e03799d85f1393d26920a68c2ef019d9cf16f4d9e90d481b318a6e3a7322 -size 350425 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 
b03206aa3d4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:78451581883d22b0da7e1bc48ec5b4ee50cc084e84c92634382c6e5e03c354ca -size 1138958 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c0be4b59a51..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4f514c2f454e1626613bb16305bac3ade0d7a9c04c4cf682fa8ecd408e5564a -size 356739 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index cee1f4ea4f6..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:467ebbd5d13580f51f4a2b4927b5f4b59f751a65343367484aeb92e31d9f198f -size 989008 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 
index 4a4013638e3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3efb985a6cda33081ce2d8c5aba151aff8bc40f7d5ffc1badc7dfc3d7ac43dd8 -size 1157902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90.cubin.cpp deleted file mode 100644 index 871bd0f8307..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18a86b28634495fe8507eebeb58cf99d1b4a63ef73c0cc57f2b756169a5acd8a -size 292045 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index a91f5319bda..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f5b4ba213cb8647fbbe5c6b1d712ac87dc32fa6e84b9fec65159dbe713bddea -size 361461 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d8943cb13b8..00000000000 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a77600e552c6c5fd3453fa1cd1e8fd4b9afa1886efbdf098d61b86b811f9808 -size 1174464 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90.cubin.cpp deleted file mode 100644 index e8f5b74f725..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_16_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb075865dbe00e8dd3d3c78fd6e00b462a90aa56ee55672e5e287d5a7c14c6ef -size 398581 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1c4de416544..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22872a0a24c70f7de9b4f22650dae6148d8051ecfb3698e869a0c72939f8bc59 -size 364619 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 31e33e1328f..00000000000 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab9708ffc243c7a4e3009a8c03631d327516eab9a929be32429fba8da10d07bf -size 1183936 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 16c715e0aa4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6cc16eedc41bed12c9ef16317f40dd0740ce703a24fa0a4f8f9ebaaa7e545c41 -size 370933 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 99e7c8fc7c4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e3b20043633c9e9e00c365ca193b76a8dd0c5c00c50eb574de90c031fba4500 -size 1020568 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 index df3ad0a894f..00000000000 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76df46781cfabdecd0152baafbcfa8748385f965d604b3bacd1fee38fe13fc8e -size 1202880 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90.cubin.cpp deleted file mode 100644 index 4f994b42322..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_64_S_qkv_32_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7630c248ce62fe373b2408f6416ee5dd0e8e7b788e60e9c3012fb1cab5ac020d -size 571445 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index ac0c24b3c59..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b65809fea52dd121c70ebf030b85accd8664f4bdbe77a7d81c170131869bcaca -size 876098 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index a847551fbcf..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22ddf513e59df0d5adb3c32ef1152d85db85c9df9101b6d09797191236dfd9b4 -size 904514 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 60522ddf1f5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36d826042260c7bcc6c6d63a738221ac63f9cf367d83c2f91719e94259906e1b -size 571439 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 62bf48cf975..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6acb2177482e0c487d86a2bb7bc828ce2827db6bdf19ba5ae6d57f4fe25fab3 -size 1817786 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 43aeb6d1725..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c55b93e19ab679a1cd8e29815562eb6501042a7e89ec522a3543db988c3b23e6 -size 631428 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d2fe647ea00..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f27f722bd64d87e19177d2b96acb1b9758becfab7d68134f0eb4c0db5365e67c -size 2221134 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 367b864114d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c2d9ee842746f90064db9df90edd5e836fc85285d9337cb212dd2badfcd6a16d -size 551705 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 69fa6c2dfc3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:e17e6784a18d982d265991b90e3cf3545b187e153780860174d60f60245ed91e -size 1731746 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0295f0b809f..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33321ecd60da6623cfa0b30a47abb59a4432e1e21a026471356a78e04e8d0c7f -size 572227 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e1870db357a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5053ac795d563b685eddef61d06c54a386db5d9a27ce839298e3f11c50a1a8f5 -size 1749902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0d09509c69e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:0d978493d34382435a184bad11c3f47adb9769288f43dec06a05d9df08ba3335 -size 571437 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index f1d5575b460..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f38abf26bdbaf3baff1891f8b9dbc7c8a500f10be470586a6a96e74e1c9c0f39 -size 2298488 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1e81e0df593..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3804a83b382f9a4c315009232d7a7cca373c66b1e62c6c1b2f989a056a2c255 -size 589593 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c25a80cbd56..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_80_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:0e19cb26c679715082ef2c4717a6a6997c94ff39229ff1ed055eba9126c82255 -size 1822520 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c4af22b500a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95987139dc44e4a14a3539e0fece21952b03a98e0a9946671aac0d6f961c8655 -size 589593 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 21a422256e7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_96_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:976986cae8ce183e425094d22d1b83c086792415b69244462ba89ac097d3e6a8 -size 1826466 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c6d83f6ce16..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:642194b891b357a64df841bc23bb7dc4ca39a9d8b5a9d8c7c4c58dd6500f3482 -size 546167 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 16ccfcd2075..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_104_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3c760d7ca2abb4aa58a7f74fa9973cf5ca6cf7e44ac667609d9c9a1d1f255a0 -size 1727788 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c25181191e5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da420de0e2ef914491faf4702be9fd7f5b869ac1d34d2464cba5f7f695519cdd -size 605365 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 2e8b3892777..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:3bf8a3c1ffaa246f4544b287d7e41a1ca1ce09f546c7c6be6a5a9456f197a18c -size 2093248 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 88c14e491c5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51b1ab0320dce8ef43366704d881f771ea91983c89e501e9b38bc2de050d15f6 -size 524853 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1effe93f2a0..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fddabe0f32ef69ab1bf2b03b1847d40a85ef60206e38e4f67190f3254b7f345d -size 1637802 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index fcf65bcbb92..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:826fb7f51910c1a40304bacb9593c907925179eb5fb9562072cdebd3cd66c0a7 -size 
546165 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index f07159d15c8..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b635aed4c7c83044534a3238921c9154d4fc0be7334483aaa4d27168fcb7993 -size 1656746 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0e164f78dcc..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10f3ef268d6727c335c9a0ea579eae37bdbc7ff88474dd9eed1a20ff327aa6fb -size 546165 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index f207994a077..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1887dffa86c664b4b4e8f511cbfdbf8fa1417801b043f4b3460e67d5e6aaa472 -size 2174548 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 96399194733..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9200f916de7b702d133b9bd63e32e930f7c14aba998e30f755f6ea31543e2f99 -size 564319 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 8fc1400e81e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f4aaf2d1242fff429d3509fb948b966caa6563fa80c47ec3312dbecffffb8e0 -size 648822 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 15e78341171..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:351131682a73ad48c77bd86f63f243ff3f4f523bc57982f5b3afc73538598a2b -size 1734100 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 2b3a950183c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:929d7e212324766b8af927e31f49727a009da195d8b1166a14c4fb8b43cd7668 -size 564319 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 3f1a11d1fdf..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_96_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97d0342ca64088ae2a8baa512f3c0396b0485bb6e8b9d9d53cb4d5945c8bfda4 -size 1738836 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90.cubin.cpp deleted file mode 100644 index 2d035debbd6..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_160_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc1c0f0b14b9f690bd3260906378ddcca79f69b51f0ba32ec324def44b0fb5d0 -size 476727 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90.cubin.cpp deleted file mode 100644 index f038140061e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_192_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfe782720b0f0a9d6db911f4650be127d681a698759b845061ef1c3fb9b9c433 -size 509879 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90.cubin.cpp deleted file mode 100644 index 880eaf463c4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b8367b6af9f8c67dd9acbc3d318369ec2f9bc9ded6545c9ffb35ca3787c089f -size 541453 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp deleted file mode 100644 index b8e146bdad6..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_16_S_qkv_256_softcapping_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:069c9ea47d4d6a38180507235a16e256d7b34bb598d1584f02b1539d1862f287 -size 373349 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index ac0da2fe8f8..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f55027948fcc281ee08a931b0d5b3c56ec60092219af03ca41dfe01a1028253d -size 946364 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index b88002c3553..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa8a8b105b52513eccb5b31f7ba547f27fab6435862d480855d7f44568870df1 -size 906092 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 8038c72f383..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70fd78a5e610c315e65ca79f94c1815b791cb56f3632bf40eb8271fcd3d2c0a7 -size 967676 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 57f6fd12b70..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a57f0015f870af927908642ccf30fa0b15adc0d244179907eb73dabc4507110 -size 922668 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 871bf70a5aa..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7399353d80b7fc39e0f55007dc9052561834adcf850bcf7d42279cd490669726 -size 623534 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 67c4bce7f02..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b1e315e4981c7b913927391905360ccb9d488215f73be72b1086f56831650eb -size 2271650 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 7ef280582cf..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c815751eb753d84d127a169504787d467bb3a20e49a60c1939a1143132a9836a -size 627482 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 9cdae05b77d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ed5ac70c4f0b560efca8060ebf136140b88e71e6fb33aa6f9b4b8e405456229 -size 1742798 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 8baa1fadc53..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fea15f138d7e1d23cafa9ca6b17ddf00ce52b9df20478418c199901dccb15ba8 -size 628270 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 59942988da1..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b428fd6b488d0e49b8ce7fa1c3a959f050f26ee36e3f0ff3db533a3146fa81c6 -size 1745166 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 015db16e362..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4af64477de61015383920cde09c5ce9549b389892ef9ae3e2f6efb26314f72cf -size 629060 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e67e7b256e2..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f8ab40a3a926520fe3a101bdbb2d58b00076733e4ad21c4ebb177ed4effbfd9 -size 2306382 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 995ac6c7c75..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37da64afd0a84e4a6f0fba0a7091e2318d8f9a5dc25617cd6f325ab4342fc525 -size 596683 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 4e1a130c5ed..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc4b148c9a971a1194e0aabad82b383ee88e9272b7aa20b601ee85fcb9fa2c88 -size 2176916 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 96e5f7f3554..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da64ec677127bfad32d00386e087128d6a14843aba37222688e19fedcd97f311 -size 600629 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 15825b6bcff..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6bf93b0226a1818ba1995dd8c85ad4ec29351ba212e5e49b594e8b0df26045a1 -size 1672532 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 200c0da6622..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdf008e244e906920a14493d475eb6e62456b868fb987de16d800c32b91c10e4 -size 601419 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0374b8cd87d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22781441789b5f75434fbc8af159613eaff14d6924f60885daef333f858ef7ce -size 1673322 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index bf4b18e3eb1..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:efabdb51896a7643b7977a685ece8cb0f67606080dc6053ea0d2f3fc1cf66968 -size 602207 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 7f548dff6b8..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ed306cd7286c68c7606eed82f3af0bac1a516dbbd2e068532e27ebceabc886f -size 2215594 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90.cubin.cpp deleted file mode 100644 index 2b3deb2f830..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_104_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3372ee5dba598687f9ff399a2aac69d3fcf294f9eb636c82ca5e00695b32c556 -size 449101 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90.cubin.cpp deleted 
file mode 100644 index 16a807e89f0..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_40_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:390e043488e3dad8e99e25ca29b55e8428d03a77869bb3e6af1ac1aacb3b3511 -size 368587 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90.cubin.cpp deleted file mode 100644 index 74596b3bd89..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_48_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f458032fe71f665e19f3257324e74f5f995c4dab61450bd980c7b74bb3abd817 -size 368587 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90.cubin.cpp deleted file mode 100644 index 1763278fc46..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_64_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6461d7e5d38b420756387b2ae9a62c5f2f0e7d67f9af087b132222fdffa5414f -size 498037 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90.cubin.cpp deleted file mode 100644 index bbbe09b0f7a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_72_sm90.cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7037c998bcc93018cb4e99dd1a48937faf85d1e903390b0cbc295b9149dc21c8 -size 550923 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90.cubin.cpp deleted file mode 100644 index 777a3f3583d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_80_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01cf622dc9c60286aa88dfac98ec58228ab4ce474d7592846003b082691dcdc5 -size 426207 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90.cubin.cpp deleted file mode 100644 index 055f9a03161..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_96_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:378dc140ed098e75bf285d93c91b6a087280cd89fcde043fb3db55c2b26833ef -size 436469 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 95d7e62bd7b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:94a26e56afb6e4a4b05e1a74ad183299c7ac9555e7baf09b3c76581eedec493b -size 331481 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d2b30335822..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3232cce8f8253cd19406ae0c8dc68fed2cbeef0dd2238c9afd75d5f01fabbdf -size 1090018 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 9b42986d453..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcde3d03c80cdf0bb932b7bb92c116351fa3049d8130f91fc553a5cec176ca19 -size 333059 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 114fa712119..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:55dd687393eef001102f5ca2488d23e7366ad1e1e5ee3dd41f7da55dd4eff7df -size 1094754 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 357540564b5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c90df0fbb2572225d63e6fcda208180096a0199dab508b5096f3b375eb7574b -size 336217 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0509b746c57..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d18cb19c00823de28731a60f93b4ef7ce6a3384cbdf52aee63a83befa545490 -size 950330 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 70312d09488..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 
-oid sha256:d0a80df9f47339d8e6a22b07f9da2fd4eb5bd2e9779b7c38d89c989e2837c7ca -size 1104226 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 97e5b6aac78..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48631e81293357f0b01565c88eeee385328dd522dd9a5a580d28d0c3b7c6ab60 -size 353567 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index cc20609493b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9f20888eae36f145023ecdb73b6d5023cc812142669f8d18a2d745985060d33 -size 1157098 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90.cubin.cpp deleted file mode 100644 index baf7b160f60..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_16_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a26b83da52c6ae5c2b4fbb318e5219005dc4d35ceece19b8884d31ecf36634b -size 404895 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 4576f4b9553..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26b55c91e24e84845f2c90e03367f1d77aab1eff8ff7633a448934822f82994b -size 355147 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e7fde9dcc76..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1b71ef82b511c6073e4ae6f9b2b410237a2522c6b925742d5384b503b9dd214 -size 1161834 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 82ad108a721..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c032ac3139498f8f5fa1e242e34408a6d994d77d80affdae556957d91b4a42c -size 358303 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index b7a387a82b1..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55d8a9c42be2b7b3a5b4797ef5b0012010fdd349e8065f73a274e0841df148c8 -size 988994 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 index a93203deb29..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5533b93e67d55e4b264530bd511195d716c93716f918e1a93469c62871d6198b -size 1171306 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90.cubin.cpp deleted file mode 100644 index 2356aae4308..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_64_S_qkv_32_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc8497badf731be36e4deaa94c57500c06231b2706be3b33d47ed4cf78acfe1b -size 571445 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90.cubin.cpp deleted file mode 100644 index 3d392ba0f29..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_160_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f25e7c84b078018a49282c2d2bd8a1e1149301250e2e31a0ce49f717c9b2f23 -size 533569 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90.cubin.cpp deleted file mode 100644 index 774503d7fc3..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_192_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a64705ebc36da535aaa671dcaf5ad1efbc260e5bd206de9e01644f1f9c87d83c -size 618818 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90.cubin.cpp deleted file mode 100644 index c16147e5e57..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66d5fa0371d74983794c1a2893e8c566d4ab3d342e503cd5d2cd8fdad8eaff73 -size 621186 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90.cubin.cpp deleted file mode 100644 index 7459e0f169a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a21e56d36e2ecc6d9e15fb93fbab5f05ae8f0495cb74a000193c30d2a95f27d0 -size 430191 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index b62c57b6164..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a3777a88ed27a6dd8488a1371e1a145ad519a20ef755ecef69859d9bda321e5 -size 853232 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index fd1962e65c0..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08512893e80bfda1b45ff37252419c600f07530e70d803f53da3ecb15d90dce4 -size 811382 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e4454bb3fdf..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a271e19cda624748e4be832161d927cdc1ed91f11498d7138348d1d78b5598b -size 879280 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d0557158f0f..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16841939c0376ec68e9cf198d7c010165f0c3e98c994a662cda6a0486c902d70 -size 828748 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 30122547071..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18b68494a11359db447783f01bcc73aa1978322f2e3e002287fb949775da5c3c -size 602231 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 2137aac324a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb33ef209555c610f58be1cb2daeaf6b760d80ee16b92f2911ae8b30bc766057 -size 2049858 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0637b997cb4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77d28aa4fccdaa59d13ce6096ca5913ada72a899e9434d05f688133a5c8f22bd -size 535139 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 401f8d33283..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9f78ba194c5d7d4c928a90f13d50c4b48fcef0533ca129a48f85e6f24506d23 -size 
1606252 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d278b2d3c49..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4cf2053691228985b29d62ab2b463ee8a2410f1118d4bf69383a99a5e694396a -size 536717 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index e3e5807a2e4..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b22837f8edf175ea3758b12164767eae7db34156f63f400ab60af205e439cef0 -size 1608620 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index ce72f744051..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:ba369586e8afd6c8af89d64517f8fd6c64c398e0934d8d48be3e601872b192c2 -size 538295 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 1a462efa1b5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_q_paged_kv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:986e435a32bf7b7a83d91db412dac096c4b7792858bb23ae4137e20d33e3f4e7 -size 2118530 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c9b515cea75..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b5527deb3d6099119b2a1c90b43a9ed6fe15fddb04e1a7f3bdf967c1fd4337a -size 575381 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 238c08d3969..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b325b76337dda6ebea7646122d66bf286b7916095ae48377bab4f363347f1409 -size 1940126 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index c015554ea2c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e2cad8fb3c2554b71e8a5fbb8f830f80bd9c92e06342be347e65ea75711e081 -size 509077 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 720a0bce388..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:239b45942e6c4013bbdf541461e106ab841e65295f14c7c2d88eafb2fe3b1bd0 -size 1516254 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index d1e4062e9c9..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:df31c078a33ab3162d1059bacaadec8ee85228851b7c0af55642e93fcf5e25ed -size 509865 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp deleted file mode 100644 index a45e50ed00e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1ab691124894776107816500eba3b00eb0a8eaff9ad4aec168643fa1d5a2f87 -size 1518622 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0995d7e4bd5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9baa9852accf04f335f8109553ae2e449d3f80275695a3b943025bcf101c60a2 -size 511445 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 96514b4e33d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_64_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:19fe04ebe55af73b6af7affd7ec3f4f9985e30a3706aeeb4c38228145964f056 -size 2006430 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90.cubin.cpp deleted file mode 100644 index c836001eca7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_104_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2714acc9a4694d195cdd02ab2fc64eac74325ac7d344c2dfd9a6fdcdf38f4d03 -size 486209 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90.cubin.cpp deleted file mode 100644 index ac78ba993de..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_40_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf8b3617ee5fee5dfa0e925b517dcd6e0d201c6c2ce80c7aa7e03e984cbefce2 -size 375701 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90.cubin.cpp deleted file mode 100644 index c6c63004117..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_48_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d74e562e010009fa44f648fd2029737ef137c5a089697e5aa2c3cbaa60719e5c -size 375701 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90.cubin.cpp deleted file mode 100644 index 2ae58c9bb2b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_64_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b56e8f68fb2fade1bad5db4bc54fd3ad418f38ecd7673b61b09f409487e265ed -size 509887 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90.cubin.cpp deleted file mode 100644 index 70da5b81822..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_72_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:773e0f6d1d825ebce1da75ad1a9b427c04d698c9a6d4838076004190580ba79b -size 569877 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90.cubin.cpp deleted file mode 100644 index 905df4bb255..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_80_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1eb5e92782ca8e5a3df8e302571783ee9ad6beb18b791e4dd8d3d5ab3be1dda -size 439637 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90.cubin.cpp deleted file mode 100644 index 025f8acc64f..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_96_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e7fd66816dc1f47f025d606bbb45f3e94f838ce43b7b40a4f9ea052029d0870 -size 458581 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 76717e5f87c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8d75f35ab03ffa3ff5ffe11066bbdaf95552d3e23fdf15f2b9ee458f98c420c8 -size 348855 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index bf8d749a678..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6a70b03a2580c2ffbff0497bded3a1a7b941b9347b8901e160aa2e73eb646db -size 1131864 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 4adf5cd0bc2..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfc986e34310721e5f3068fee18e2c82ba50a44c291294a06c9de7fca62e9b1b -size 352013 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 6f7acd3264e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6454f4d1e3ca41c8189e6e0a9a80b7fb3e37b665cd0107e14a342b2be19b790f -size 1141336 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index b51800cb52d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b40add14bac5d856a496a64325d00897ea76020281c5bdb1d0d0c69bdbd9a14e -size 358327 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 6dab05e6bac..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74e4e84f9d13d4781962db89e1bd88b4228f83e9399943ebf5e189efdc751f63 -size 990596 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 504929f757a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_q_paged_kv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bcba8f07759f445a389837a85de7cef90814c7f43415ce01f33083fcd9f8eae2 -size 1160280 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 36c41d3437d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:af79b6bf63752c315ebbb3371b089846d0d2f5feb8744fc995144e71024a02c6 -size 363049 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 0994e019344..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_160_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d0e5c891554697129b8b38a4eb59c0459f5267539482b561cbf82a16294d899 -size 1176842 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90.cubin.cpp deleted file mode 100644 index ef409f6d74f..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_16_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:09c50d96c8d82d3cad9fb1d3aa5f178d5136d2069f3908727a5e7bb6ed927339 -size 399381 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index f662c1bdfc6..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7ea7ce5d543dcc86daf4e1c4a9adba10a208375c30aec13f84a428ec522179e 
-size 366207 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 529ffb27906..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_192_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:408cc151f0b72313351b97bbc6b2c05d94ccda42d4cb85044d3ba80fe4174fad -size 1186314 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 36fbb5ad56a..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_alibi_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b268e9bc4735c75f990a4bb713b37760e3f7701af1d788fb1b49cc9f042ae71 -size 372521 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 44097768adc..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_softcapping_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:280690085e1eeabf5a2fa9370e8b45ea15f409936e6900edce075e81929576d2 -size 1022156 
diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 35f0c1ab414..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_256_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf689aaea749d65a25eed4d7fb2b670ad02931b40749b8c7bb8c3bed2b847b8b -size 1205258 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90.cubin.cpp deleted file mode 100644 index e5649ce8e7b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_64_S_qkv_32_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73631df3695b5559cfb437f2bef3f6dc6c3b2396ad5bbb3524c8190238f3709 -size 573033 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index b346fb8b978..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36fb9bbc2236d32b3b750a8b328ac38937038d7f581a8daa24858ecc26587505 -size 1664638 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_64_ldgsts_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 6d8ceca0cb1..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_256_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c54f77b78445c1f3463240e8424757b2a4815a3b1bd94e86a2268d36eb9906c -size 1757780 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index fcb495fbd67..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6cfaf0c44cee0824c3bda81ce3ae23f578d33dcf94ef3f831fe8152cd191c9fb -size 1382058 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 6035d87f56b..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_384_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:784a9abde49c19009642f5091869d44f5c00a605718dbe72eebed9df37b75d16 -size 1453886 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index ab01c9939e5..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:96f21c237226da6eec97209043660b7b41fb9e840aa7a2085b4c9de655ebac9a -size 1715946 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 14fc09c9c51..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_512_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1627e09a435816f8b491e710e44d6ff024ff40428c78cb7be691e7735189e09 -size 1816190 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index ae7555d9a35..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:208aa31e01d671d97787e261954daa99da51e05cac5106fd2da86d71649500b6 -size 679548 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 854ebc6a6fd..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_64_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a38d4ab436b225a9e710b4d53cc4945f06ffb0ce4e606c5493d8ca6d35167468 -size 701650 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_32_ldgsts_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index d58fa749d36..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18e9e81ffc919bda7d1dd4ec688c6b7fb1a26868459131f11624175f7e50d862 -size 1675700 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 589c6d0a0f7..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_256_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cb36bde5da4deb3811f99f1dcac901d0c32a4076a56d9d3dc75ae5ee4d47dbf -size 1751476 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index d668ccfb846..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d5acad3f47c50696255e68904eaa0b683eb8d59d9b991cc835dd5eeb11b59ab -size 1359966 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 4753a4f58b1..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_384_64_ldgsts_sm90.cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bd9e0abc560102b18ac833953af11f4e22236697f6471de22e52438be286391 -size 1467316 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index b629ac5932e..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:764a4ada9a55e899cea280f2a533a64687c3495bbc4dbabcfe38a7b679b3a223 -size 1760158 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 5b449257826..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_512_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a34af1c977c6d5def822602cce9bce54e23de2679efc7d4d0f58bd792fe7d49 -size 1852510 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 949cb41021d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d92bfca6c7b882bc63f401f0ec5ae0214b5241202a09710d8adfe57c15772e0a -size 677190 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_64_ldgsts_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 0022c8afa44..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_64_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e695f14d955749ceeadc909eb7480c609e7cbf24a5f22afcb41eb74f2efedf8 -size 704028